summary refs log tree commit diff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/acct.c2
-rw-r--r--kernel/audit.c8
-rw-r--r--kernel/audit.h13
-rw-r--r--kernel/audit_fsnotify.c4
-rw-r--r--kernel/audit_watch.c12
-rw-r--r--kernel/auditfilter.c3
-rw-r--r--kernel/auditsc.c4
-rw-r--r--kernel/bpf/Makefile3
-rw-r--r--kernel/bpf/arena.c28
-rw-r--r--kernel/bpf/arraymap.c6
-rw-r--r--kernel/bpf/backtrack.c934
-rw-r--r--kernel/bpf/bloom_filter.c2
-rw-r--r--kernel/bpf/bpf_cgrp_storage.c11
-rw-r--r--kernel/bpf/bpf_inode_storage.c11
-rw-r--r--kernel/bpf/bpf_insn_array.c2
-rw-r--r--kernel/bpf/bpf_local_storage.c174
-rw-r--r--kernel/bpf/bpf_lsm.c7
-rw-r--r--kernel/bpf/bpf_struct_ops.c7
-rw-r--r--kernel/bpf/bpf_task_storage.c11
-rw-r--r--kernel/bpf/btf.c110
-rw-r--r--kernel/bpf/cfg.c872
-rw-r--r--kernel/bpf/check_btf.c463
-rw-r--r--kernel/bpf/const_fold.c396
-rw-r--r--kernel/bpf/core.c276
-rw-r--r--kernel/bpf/cpumap.c22
-rw-r--r--kernel/bpf/devmap.c52
-rw-r--r--kernel/bpf/fixups.c2570
-rw-r--r--kernel/bpf/hashtab.c92
-rw-r--r--kernel/bpf/helpers.c260
-rw-r--r--kernel/bpf/liveness.c2410
-rw-r--r--kernel/bpf/local_storage.c4
-rw-r--r--kernel/bpf/log.c62
-rw-r--r--kernel/bpf/lpm_trie.c2
-rw-r--r--kernel/bpf/memalloc.c91
-rw-r--r--kernel/bpf/offload.c10
-rw-r--r--kernel/bpf/states.c1563
-rw-r--r--kernel/bpf/syscall.c69
-rw-r--r--kernel/bpf/task_iter.c151
-rw-r--r--kernel/bpf/tnum.c48
-rw-r--r--kernel/bpf/trampoline.c4
-rw-r--r--kernel/bpf/verifier.c10078
-rw-r--r--kernel/cgroup/cgroup-internal.h6
-rw-r--r--kernel/cgroup/cgroup.c203
-rw-r--r--kernel/cgroup/cpuset.c258
-rw-r--r--kernel/cgroup/dmem.c5
-rw-r--r--kernel/cgroup/rdma.c2
-rw-r--r--kernel/configs/debug.config1
-rw-r--r--kernel/crash_core.c2
-rw-r--r--kernel/crash_dump_dm_crypt.c26
-rw-r--r--kernel/crash_reserve.c2
-rw-r--r--kernel/dma/Kconfig3
-rw-r--r--kernel/dma/coherent.c19
-rw-r--r--kernel/dma/contiguous.c144
-rw-r--r--kernel/dma/debug.c10
-rw-r--r--kernel/dma/direct.c23
-rw-r--r--kernel/dma/direct.h40
-rw-r--r--kernel/dma/map_benchmark.c246
-rw-r--r--kernel/dma/mapping.c25
-rw-r--r--kernel/dma/swiotlb.c47
-rw-r--r--kernel/entry/common.c109
-rw-r--r--kernel/events/core.c147
-rw-r--r--kernel/events/uprobes.c4
-rw-r--r--kernel/exit.c17
-rw-r--r--kernel/fork.c120
-rw-r--r--kernel/futex/Makefile2
-rw-r--r--kernel/futex/core.c11
-rw-r--r--kernel/futex/futex.h17
-rw-r--r--kernel/futex/pi.c12
-rw-r--r--kernel/futex/syscalls.c8
-rw-r--r--kernel/futex/waitwake.c4
-rw-r--r--kernel/hung_task.c106
-rw-r--r--kernel/irq/affinity.c7
-rw-r--r--kernel/irq/chip.c3
-rw-r--r--kernel/irq/matrix.c2
-rw-r--r--kernel/irq_work.c2
-rw-r--r--kernel/jump_label.c9
-rw-r--r--kernel/kcsan/kcsan_test.c2
-rw-r--r--kernel/kexec_core.c1
-rw-r--r--kernel/kprobes.c8
-rw-r--r--kernel/ksysfs.c9
-rw-r--r--kernel/kthread.c41
-rw-r--r--kernel/liveupdate/kexec_handover.c679
-rw-r--r--kernel/liveupdate/kexec_handover_debugfs.c26
-rw-r--r--kernel/liveupdate/kexec_handover_internal.h3
-rw-r--r--kernel/liveupdate/luo_core.c12
-rw-r--r--kernel/liveupdate/luo_file.c41
-rw-r--r--kernel/liveupdate/luo_session.c9
-rw-r--r--kernel/locking/Makefile5
-rw-r--r--kernel/locking/mutex-debug.c9
-rw-r--r--kernel/locking/mutex.c122
-rw-r--r--kernel/locking/mutex.h7
-rw-r--r--kernel/locking/rtmutex.c18
-rw-r--r--kernel/locking/rtmutex_api.c2
-rw-r--r--kernel/locking/rtmutex_common.h27
-rw-r--r--kernel/locking/rwbase_rt.c1
-rw-r--r--kernel/locking/rwsem.c113
-rw-r--r--kernel/locking/semaphore.c41
-rw-r--r--kernel/locking/spinlock.c12
-rw-r--r--kernel/locking/ww_mutex.h65
-rw-r--r--kernel/locking/ww_rt_mutex.c1
-rw-r--r--kernel/module/Kconfig23
-rw-r--r--kernel/module/internal.h4
-rw-r--r--kernel/module/main.c205
-rw-r--r--kernel/module/signing.c4
-rw-r--r--kernel/module_signature.c2
-rw-r--r--kernel/nscommon.c6
-rw-r--r--kernel/nsproxy.c28
-rw-r--r--kernel/nstree.c29
-rw-r--r--kernel/padata.c130
-rw-r--r--kernel/panic.c47
-rw-r--r--kernel/params.c29
-rw-r--r--kernel/pid.c37
-rw-r--r--kernel/pid_namespace.c9
-rw-r--r--kernel/power/em_netlink.c2
-rw-r--r--kernel/power/main.c2
-rw-r--r--kernel/power/snapshot.c11
-rw-r--r--kernel/power/user.c7
-rw-r--r--kernel/ptrace.c3
-rw-r--r--kernel/rcu/Kconfig.debug11
-rw-r--r--kernel/rcu/rcu.h13
-rw-r--r--kernel/rcu/rcuscale.c78
-rw-r--r--kernel/rcu/rcutorture.c64
-rw-r--r--kernel/rcu/refscale.c51
-rw-r--r--kernel/rcu/srcutiny.c19
-rw-r--r--kernel/rcu/srcutree.c211
-rw-r--r--kernel/rcu/tasks.h6
-rw-r--r--kernel/rcu/tree_nocb.h121
-rw-r--r--kernel/rcu/update.c22
-rw-r--r--kernel/resource.c33
-rw-r--r--kernel/rseq.c8
-rw-r--r--kernel/sched/core.c512
-rw-r--r--kernel/sched/cpufreq_schedutil.c5
-rw-r--r--kernel/sched/deadline.c96
-rw-r--r--kernel/sched/debug.c18
-rw-r--r--kernel/sched/ext.c4192
-rw-r--r--kernel/sched/ext.h4
-rw-r--r--kernel/sched/ext_idle.c237
-rw-r--r--kernel/sched/ext_idle.h2
-rw-r--r--kernel/sched/ext_internal.h460
-rw-r--r--kernel/sched/fair.c698
-rw-r--r--kernel/sched/features.h8
-rw-r--r--kernel/sched/idle.c39
-rw-r--r--kernel/sched/isolation.c4
-rw-r--r--kernel/sched/rt.c64
-rw-r--r--kernel/sched/sched.h117
-rw-r--r--kernel/sched/syscalls.c46
-rw-r--r--kernel/sched/topology.c279
-rw-r--r--kernel/signal.c21
-rw-r--r--kernel/smp.c62
-rw-r--r--kernel/softirq.c15
-rw-r--r--kernel/sys.c30
-rw-r--r--kernel/sysctl.c2
-rw-r--r--kernel/taskstats.c1
-rw-r--r--kernel/time/.kunitconfig2
-rw-r--r--kernel/time/Kconfig32
-rw-r--r--kernel/time/Makefile1
-rw-r--r--kernel/time/alarmtimer.c14
-rw-r--r--kernel/time/clockevents.c71
-rw-r--r--kernel/time/clocksource-wdtest.c268
-rw-r--r--kernel/time/clocksource.c805
-rw-r--r--kernel/time/hrtimer.c1128
-rw-r--r--kernel/time/jiffies.c3
-rw-r--r--kernel/time/namespace.c203
-rw-r--r--kernel/time/namespace_internal.h28
-rw-r--r--kernel/time/namespace_vdso.c160
-rw-r--r--kernel/time/posix-timers.c2
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c1
-rw-r--r--kernel/time/tick-broadcast.c8
-rw-r--r--kernel/time/tick-common.c1
-rw-r--r--kernel/time/tick-sched.c30
-rw-r--r--kernel/time/time.c21
-rw-r--r--kernel/time/timekeeping.c209
-rw-r--r--kernel/time/timekeeping.h2
-rw-r--r--kernel/time/timer.c5
-rw-r--r--kernel/time/timer_list.c16
-rw-r--r--kernel/time/timer_migration.c4
-rw-r--r--kernel/torture.c2
-rw-r--r--kernel/trace/Kconfig14
-rw-r--r--kernel/trace/Makefile60
-rw-r--r--kernel/trace/blktrace.c3
-rw-r--r--kernel/trace/bpf_trace.c8
-rw-r--r--kernel/trace/fprobe.c2
-rw-r--r--kernel/trace/ftrace.c42
-rw-r--r--kernel/trace/remote_test.c261
-rw-r--r--kernel/trace/remote_test_events.h10
-rw-r--r--kernel/trace/ring_buffer.c410
-rw-r--r--kernel/trace/rv/Kconfig18
-rw-r--r--kernel/trace/rv/Makefile3
-rw-r--r--kernel/trace/rv/monitors/deadline/Kconfig10
-rw-r--r--kernel/trace/rv/monitors/deadline/deadline.c44
-rw-r--r--kernel/trace/rv/monitors/deadline/deadline.h202
-rw-r--r--kernel/trace/rv/monitors/nomiss/Kconfig15
-rw-r--r--kernel/trace/rv/monitors/nomiss/nomiss.c293
-rw-r--r--kernel/trace/rv/monitors/nomiss/nomiss.h123
-rw-r--r--kernel/trace/rv/monitors/nomiss/nomiss_trace.h19
-rw-r--r--kernel/trace/rv/monitors/opid/Kconfig11
-rw-r--r--kernel/trace/rv/monitors/opid/opid.c111
-rw-r--r--kernel/trace/rv/monitors/opid/opid.h86
-rw-r--r--kernel/trace/rv/monitors/opid/opid_trace.h4
-rw-r--r--kernel/trace/rv/monitors/sleep/sleep.c8
-rw-r--r--kernel/trace/rv/monitors/sleep/sleep.h98
-rw-r--r--kernel/trace/rv/monitors/stall/Kconfig13
-rw-r--r--kernel/trace/rv/monitors/stall/stall.c150
-rw-r--r--kernel/trace/rv/monitors/stall/stall.h81
-rw-r--r--kernel/trace/rv/monitors/stall/stall_trace.h19
-rw-r--r--kernel/trace/rv/rv_trace.h67
-rw-r--r--kernel/trace/simple_ring_buffer.c517
-rw-r--r--kernel/trace/trace.c1459
-rw-r--r--kernel/trace/trace.h150
-rw-r--r--kernel/trace/trace_boot.c5
-rw-r--r--kernel/trace/trace_events.c167
-rw-r--r--kernel/trace/trace_events_hist.c29
-rw-r--r--kernel/trace/trace_events_synth.c4
-rw-r--r--kernel/trace/trace_events_trigger.c82
-rw-r--r--kernel/trace/trace_functions_graph.c19
-rw-r--r--kernel/trace/trace_kprobe.c11
-rw-r--r--kernel/trace/trace_osnoise.c64
-rw-r--r--kernel/trace/trace_output.c32
-rw-r--r--kernel/trace/trace_printk.c1
-rw-r--r--kernel/trace/trace_probe.c2
-rw-r--r--kernel/trace/trace_remote.c1384
-rw-r--r--kernel/trace/trace_snapshot.c1066
-rw-r--r--kernel/trace/trace_syscalls.c3
-rw-r--r--kernel/tracepoint.c2
-rw-r--r--kernel/vmcore_info.c4
-rw-r--r--kernel/watchdog.c148
-rw-r--r--kernel/watchdog_buddy.c9
-rw-r--r--kernel/workqueue.c379
-rw-r--r--kernel/workqueue_internal.h1
229 files changed, 27343 insertions, 15266 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 1e19722c64c3..cbbf79d718cf 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -244,7 +244,7 @@ static int acct_on(const char __user *name)
if (!S_ISREG(file_inode(file)->i_mode))
return -EACCES;
- /* Exclude kernel kernel internal filesystems. */
+ /* Exclude kernel internal filesystems. */
if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT))
return -EINVAL;
diff --git a/kernel/audit.c b/kernel/audit.c
index 5a0216056524..e1d489bc2dff 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -355,8 +355,8 @@ void audit_panic(const char *message)
static inline int audit_rate_check(void)
{
- static unsigned long last_check = 0;
- static int messages = 0;
+ static unsigned long last_check;
+ static int messages;
static DEFINE_SPINLOCK(lock);
unsigned long flags;
unsigned long now;
@@ -391,7 +391,7 @@ static inline int audit_rate_check(void)
*/
void audit_log_lost(const char *message)
{
- static unsigned long last_msg = 0;
+ static unsigned long last_msg;
static DEFINE_SPINLOCK(lock);
unsigned long flags;
unsigned long now;
@@ -1295,6 +1295,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
memset(&s, 0, sizeof(s));
/* guard against past and future API changes */
memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
+ if (s.mask & ~AUDIT_STATUS_ALL)
+ return -EINVAL;
if (s.mask & AUDIT_STATUS_ENABLED) {
err = audit_set_enabled(s.enabled);
if (err < 0)
diff --git a/kernel/audit.h b/kernel/audit.h
index 7c401729e21b..ac81fa02bcd7 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -76,7 +76,7 @@ struct audit_names {
int name_len; /* number of chars to log */
bool hidden; /* don't log this record */
- unsigned long ino;
+ u64 ino;
dev_t dev;
umode_t mode;
kuid_t uid;
@@ -225,9 +225,9 @@ extern int auditd_test_task(struct task_struct *task);
#define AUDIT_INODE_BUCKETS 32
extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
-static inline int audit_hash_ino(u32 ino)
+static inline int audit_hash_ino(u64 ino)
{
- return (ino & (AUDIT_INODE_BUCKETS-1));
+ return ((u32)ino & (AUDIT_INODE_BUCKETS-1));
}
/* Indicates that audit should log the full pathname. */
@@ -277,16 +277,15 @@ extern int audit_to_watch(struct audit_krule *krule, char *path, int len,
extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
extern void audit_remove_watch_rule(struct audit_krule *krule);
extern char *audit_watch_path(struct audit_watch *watch);
-extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino,
- dev_t dev);
+extern int audit_watch_compare(struct audit_watch *watch, u64 ino, dev_t dev);
extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule,
char *pathname, int len);
extern char *audit_mark_path(struct audit_fsnotify_mark *mark);
extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark);
extern void audit_remove_mark_rule(struct audit_krule *krule);
-extern int audit_mark_compare(struct audit_fsnotify_mark *mark,
- unsigned long ino, dev_t dev);
+extern int audit_mark_compare(struct audit_fsnotify_mark *mark, u64 ino,
+ dev_t dev);
extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old);
extern int audit_exe_compare(struct task_struct *tsk,
struct audit_fsnotify_mark *mark);
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index a4401f651060..711454f9f724 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -25,7 +25,7 @@
*/
struct audit_fsnotify_mark {
dev_t dev; /* associated superblock device */
- unsigned long ino; /* associated inode number */
+ u64 ino; /* associated inode number */
char *path; /* insertion path */
struct fsnotify_mark mark; /* fsnotify mark on the inode */
struct audit_krule *rule;
@@ -57,7 +57,7 @@ char *audit_mark_path(struct audit_fsnotify_mark *mark)
return mark->path;
}
-int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev)
+int audit_mark_compare(struct audit_fsnotify_mark *mark, u64 ino, dev_t dev)
{
if (mark->ino == AUDIT_INO_UNSET)
return 0;
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 096faac2435c..33577f0f54ef 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -37,7 +37,7 @@ struct audit_watch {
refcount_t count; /* reference count */
dev_t dev; /* associated superblock device */
char *path; /* insertion path */
- unsigned long ino; /* associated inode number */
+ u64 ino; /* associated inode number */
struct audit_parent *parent; /* associated parent */
struct list_head wlist; /* entry in parent->watches list */
struct list_head rules; /* anchor for krule->rlist */
@@ -125,7 +125,7 @@ char *audit_watch_path(struct audit_watch *watch)
return watch->path;
}
-int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
+int audit_watch_compare(struct audit_watch *watch, u64 ino, dev_t dev)
{
return (watch->ino != AUDIT_INO_UNSET) &&
(watch->ino == ino) &&
@@ -244,7 +244,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
/* Update inode info in audit rules based on filesystem event. */
static void audit_update_watch(struct audit_parent *parent,
const struct qstr *dname, dev_t dev,
- unsigned long ino, unsigned invalidating)
+ u64 ino, unsigned invalidating)
{
struct audit_watch *owatch, *nwatch, *nextw;
struct audit_krule *r, *nextr;
@@ -285,7 +285,7 @@ static void audit_update_watch(struct audit_parent *parent,
list_del(&oentry->rule.list);
audit_panic("error updating watch, removing");
} else {
- int h = audit_hash_ino((u32)ino);
+ int h = audit_hash_ino(ino);
/*
* nentry->rule.watch == oentry->rule.watch so
@@ -439,7 +439,7 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
audit_add_to_parent(krule, parent);
- h = audit_hash_ino((u32)watch->ino);
+ h = audit_hash_ino(watch->ino);
*list = &audit_inode_hash[h];
error:
path_put(&parent_path);
@@ -527,7 +527,7 @@ int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old)
int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
{
struct file *exe_file;
- unsigned long ino;
+ u64 ino;
dev_t dev;
/* only do exe filtering if we are recording @current events/records */
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 6e3abbf08e3d..093425123f6c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -303,8 +303,7 @@ exit_err:
return ERR_PTR(err);
}
-static u32 audit_ops[] =
-{
+static u32 audit_ops[] = {
[Audit_equal] = AUDIT_EQUAL,
[Audit_not_equal] = AUDIT_NOT_EQUAL,
[Audit_bitmask] = AUDIT_BIT_MASK,
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f6af6a8f68c4..ab54fccba215 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -886,7 +886,7 @@ static int audit_filter_inode_name(struct task_struct *tsk,
struct audit_names *n,
struct audit_context *ctx)
{
- int h = audit_hash_ino((u32)n->ino);
+ int h = audit_hash_ino(n->ino);
struct list_head *list = &audit_inode_hash[h];
return __audit_filter_op(tsk, ctx, list, n, ctx->major);
@@ -1534,7 +1534,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
audit_log_format(ab, " name=(null)");
if (n->ino != AUDIT_INO_UNSET)
- audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x",
+ audit_log_format(ab, " inode=%llu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x",
n->ino,
MAJOR(n->dev),
MINOR(n->dev),
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 79cf22860a99..399007b67a92 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,11 +6,12 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
endif
CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy)
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o const_fold.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
+obj-$(CONFIG_BPF_SYSCALL) += fixups.o cfg.o states.o backtrack.o check_btf.o
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 144f30e740e8..802656c6fd3c 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -303,7 +303,7 @@ static long arena_map_update_elem(struct bpf_map *map, void *key,
return -EOPNOTSUPP;
}
-static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf,
+static int arena_map_check_btf(struct bpf_map *map, const struct btf *btf,
const struct btf_type *key_type, const struct btf_type *value_type)
{
return 0;
@@ -341,6 +341,16 @@ static void arena_vm_open(struct vm_area_struct *vma)
refcount_inc(&vml->mmap_count);
}
+static int arena_vm_may_split(struct vm_area_struct *vma, unsigned long addr)
+{
+ return -EINVAL;
+}
+
+static int arena_vm_mremap(struct vm_area_struct *vma)
+{
+ return -EINVAL;
+}
+
static void arena_vm_close(struct vm_area_struct *vma)
{
struct bpf_map *map = vma->vm_file->private_data;
@@ -417,6 +427,8 @@ out_unlock_sigsegv:
static const struct vm_operations_struct arena_vm_ops = {
.open = arena_vm_open,
+ .may_split = arena_vm_may_split,
+ .mremap = arena_vm_mremap,
.close = arena_vm_close,
.fault = arena_vm_fault,
};
@@ -486,10 +498,11 @@ static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
arena->user_vm_end = vma->vm_end;
/*
* bpf_map_mmap() checks that it's being mmaped as VM_SHARED and
- * clears VM_MAYEXEC. Set VM_DONTEXPAND as well to avoid
- * potential change of user_vm_start.
+ * clears VM_MAYEXEC. Set VM_DONTEXPAND to avoid potential change
+ * of user_vm_start. Set VM_DONTCOPY to prevent arena VMA from
+ * being copied into the child process on fork.
*/
- vm_flags_set(vma, VM_DONTEXPAND);
+ vm_flags_set(vma, VM_DONTEXPAND | VM_DONTCOPY);
vma->vm_ops = &arena_vm_ops;
return 0;
}
@@ -549,6 +562,10 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
u32 uaddr32;
int ret, i;
+ if (node_id != NUMA_NO_NODE &&
+ ((unsigned int)node_id >= nr_node_ids || !node_online(node_id)))
+ return 0;
+
if (page_cnt > page_cnt_max)
return 0;
@@ -656,8 +673,7 @@ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
guard(mutex)(&arena->lock);
/* iterate link list under lock */
list_for_each_entry(vml, &arena->vma_list, head)
- zap_page_range_single(vml->vma, uaddr,
- PAGE_SIZE * page_cnt, NULL);
+ zap_vma_range(vml->vma, uaddr, PAGE_SIZE * page_cnt);
}
static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 26763df6134a..5e25e0353509 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -548,7 +548,7 @@ static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
-static int array_map_check_btf(const struct bpf_map *map,
+static int array_map_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
@@ -1015,8 +1015,10 @@ static void bpf_fd_array_map_clear(struct bpf_map *map, bool need_defer)
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
- for (i = 0; i < array->map.max_entries; i++)
+ for (i = 0; i < array->map.max_entries; i++) {
__fd_array_map_delete_elem(map, &i, need_defer);
+ cond_resched();
+ }
}
static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
diff --git a/kernel/bpf/backtrack.c b/kernel/bpf/backtrack.c
new file mode 100644
index 000000000000..854731dc93fe
--- /dev/null
+++ b/kernel/bpf/backtrack.c
@@ -0,0 +1,934 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/filter.h>
+#include <linux/bitmap.h>
+
+#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
+
+/* for any branch, call, exit record the history of jmps in the given state */
+int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
+ int insn_flags, u64 linked_regs)
+{
+ u32 cnt = cur->jmp_history_cnt;
+ struct bpf_jmp_history_entry *p;
+ size_t alloc_size;
+
+ /* combine instruction flags if we already recorded this instruction */
+ if (env->cur_hist_ent) {
+ /* atomic instructions push insn_flags twice, for READ and
+ * WRITE sides, but they should agree on stack slot
+ */
+ verifier_bug_if((env->cur_hist_ent->flags & insn_flags) &&
+ (env->cur_hist_ent->flags & insn_flags) != insn_flags,
+ env, "insn history: insn_idx %d cur flags %x new flags %x",
+ env->insn_idx, env->cur_hist_ent->flags, insn_flags);
+ env->cur_hist_ent->flags |= insn_flags;
+ verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env,
+ "insn history: insn_idx %d linked_regs: %#llx",
+ env->insn_idx, env->cur_hist_ent->linked_regs);
+ env->cur_hist_ent->linked_regs = linked_regs;
+ return 0;
+ }
+
+ cnt++;
+ alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
+ p = krealloc(cur->jmp_history, alloc_size, GFP_KERNEL_ACCOUNT);
+ if (!p)
+ return -ENOMEM;
+ cur->jmp_history = p;
+
+ p = &cur->jmp_history[cnt - 1];
+ p->idx = env->insn_idx;
+ p->prev_idx = env->prev_insn_idx;
+ p->flags = insn_flags;
+ p->linked_regs = linked_regs;
+ cur->jmp_history_cnt = cnt;
+ env->cur_hist_ent = p;
+
+ return 0;
+}
+
+static bool is_atomic_load_insn(const struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_LOAD_ACQ;
+}
+
+static bool is_atomic_fetch_insn(const struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_ATOMIC &&
+ (insn->imm & BPF_FETCH);
+}
+
+static int insn_stack_access_spi(int insn_flags)
+{
+ return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK;
+}
+
+static int insn_stack_access_frameno(int insn_flags)
+{
+ return insn_flags & INSN_F_FRAMENO_MASK;
+}
+
+/* Backtrack one insn at a time. If idx is not at the top of recorded
+ * history then previous instruction came from straight line execution.
+ * Return -ENOENT if we exhausted all instructions within given state.
+ *
+ * It's legal to have a bit of a looping with the same starting and ending
+ * insn index within the same state, e.g.: 3->4->5->3, so just because current
+ * instruction index is the same as state's first_idx doesn't mean we are
+ * done. If there is still some jump history left, we should keep going. We
+ * need to take into account that we might have a jump history between given
+ * state's parent and itself, due to checkpointing. In this case, we'll have
+ * history entry recording a jump from last instruction of parent state and
+ * first instruction of given state.
+ */
+static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
+ u32 *history)
+{
+ u32 cnt = *history;
+
+ if (i == st->first_insn_idx) {
+ if (cnt == 0)
+ return -ENOENT;
+ if (cnt == 1 && st->jmp_history[0].idx == i)
+ return -ENOENT;
+ }
+
+ if (cnt && st->jmp_history[cnt - 1].idx == i) {
+ i = st->jmp_history[cnt - 1].prev_idx;
+ (*history)--;
+ } else {
+ i--;
+ }
+ return i;
+}
+
+static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st,
+ u32 hist_end, int insn_idx)
+{
+ if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx)
+ return &st->jmp_history[hist_end - 1];
+ return NULL;
+}
+
+static inline void bt_init(struct backtrack_state *bt, u32 frame)
+{
+ bt->frame = frame;
+}
+
+static inline void bt_reset(struct backtrack_state *bt)
+{
+ struct bpf_verifier_env *env = bt->env;
+
+ memset(bt, 0, sizeof(*bt));
+ bt->env = env;
+}
+
+static inline u32 bt_empty(struct backtrack_state *bt)
+{
+ u64 mask = 0;
+ int i;
+
+ for (i = 0; i <= bt->frame; i++)
+ mask |= bt->reg_masks[i] | bt->stack_masks[i];
+
+ return mask == 0;
+}
+
+static inline int bt_subprog_enter(struct backtrack_state *bt)
+{
+ if (bt->frame == MAX_CALL_FRAMES - 1) {
+ verifier_bug(bt->env, "subprog enter from frame %d", bt->frame);
+ return -EFAULT;
+ }
+ bt->frame++;
+ return 0;
+}
+
+static inline int bt_subprog_exit(struct backtrack_state *bt)
+{
+ if (bt->frame == 0) {
+ verifier_bug(bt->env, "subprog exit from frame 0");
+ return -EFAULT;
+ }
+ bt->frame--;
+ return 0;
+}
+
+static inline void bt_clear_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg)
+{
+ bt->reg_masks[frame] &= ~(1 << reg);
+}
+
+static inline void bt_set_reg(struct backtrack_state *bt, u32 reg)
+{
+ bpf_bt_set_frame_reg(bt, bt->frame, reg);
+}
+
+static inline void bt_clear_reg(struct backtrack_state *bt, u32 reg)
+{
+ bt_clear_frame_reg(bt, bt->frame, reg);
+}
+
+static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+ bt->stack_masks[frame] &= ~(1ull << slot);
+}
+
+static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame)
+{
+ return bt->reg_masks[frame];
+}
+
+static inline u32 bt_reg_mask(struct backtrack_state *bt)
+{
+ return bt->reg_masks[bt->frame];
+}
+
+static inline u64 bt_frame_stack_mask(struct backtrack_state *bt, u32 frame)
+{
+ return bt->stack_masks[frame];
+}
+
+static inline u64 bt_stack_mask(struct backtrack_state *bt)
+{
+ return bt->stack_masks[bt->frame];
+}
+
+static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg)
+{
+ return bt->reg_masks[bt->frame] & (1 << reg);
+}
+
+
+/* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */
+static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask)
+{
+ DECLARE_BITMAP(mask, 64);
+ bool first = true;
+ int i, n;
+
+ buf[0] = '\0';
+
+ bitmap_from_u64(mask, reg_mask);
+ for_each_set_bit(i, mask, 32) {
+ n = snprintf(buf, buf_sz, "%sr%d", first ? "" : ",", i);
+ first = false;
+ buf += n;
+ buf_sz -= n;
+ if (buf_sz < 0)
+ break;
+ }
+}
+/* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */
+void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
+{
+ DECLARE_BITMAP(mask, 64);
+ bool first = true;
+ int i, n;
+
+ buf[0] = '\0';
+
+ bitmap_from_u64(mask, stack_mask);
+ for_each_set_bit(i, mask, 64) {
+ n = snprintf(buf, buf_sz, "%s%d", first ? "" : ",", -(i + 1) * 8);
+ first = false;
+ buf += n;
+ buf_sz -= n;
+ if (buf_sz < 0)
+ break;
+ }
+}
+
+
+/* For given verifier state backtrack_insn() is called from the last insn to
+ * the first insn. Its purpose is to compute a bitmask of registers and
+ * stack slots that needs precision in the parent verifier state.
+ *
+ * @idx is an index of the instruction we are currently processing;
+ * @subseq_idx is an index of the subsequent instruction that:
+ * - *would be* executed next, if jump history is viewed in forward order;
+ * - *was* processed previously during backtracking.
+ */
+static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
+ struct bpf_jmp_history_entry *hist, struct backtrack_state *bt)
+{
+ struct bpf_insn *insn = env->prog->insnsi + idx;
+ u8 class = BPF_CLASS(insn->code);
+ u8 opcode = BPF_OP(insn->code);
+ u8 mode = BPF_MODE(insn->code);
+ u32 dreg = insn->dst_reg;
+ u32 sreg = insn->src_reg;
+ u32 spi, i, fr;
+
+ if (insn->code == 0)
+ return 0;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt));
+ verbose(env, "mark_precise: frame%d: regs=%s ",
+ bt->frame, env->tmp_str_buf);
+ bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt));
+ verbose(env, "stack=%s before ", env->tmp_str_buf);
+ verbose(env, "%d: ", idx);
+ bpf_verbose_insn(env, insn);
+ }
+
+ /* If there is a history record that some registers gained range at this insn,
+ * propagate precision marks to those registers, so that bt_is_reg_set()
+ * accounts for these registers.
+ */
+ bpf_bt_sync_linked_regs(bt, hist);
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ if (!bt_is_reg_set(bt, dreg))
+ return 0;
+ if (opcode == BPF_END || opcode == BPF_NEG) {
+ /* sreg is reserved and unused
+ * dreg still need precision before this insn
+ */
+ return 0;
+ } else if (opcode == BPF_MOV) {
+ if (BPF_SRC(insn->code) == BPF_X) {
+ /* dreg = sreg or dreg = (s8, s16, s32)sreg
+ * dreg needs precision after this insn
+ * sreg needs precision before this insn
+ */
+ bt_clear_reg(bt, dreg);
+ if (sreg != BPF_REG_FP)
+ bt_set_reg(bt, sreg);
+ } else {
+ /* dreg = K
+ * dreg needs precision after this insn.
+ * Corresponding register is already marked
+ * as precise=true in this verifier state.
+ * No further markings in parent are necessary
+ */
+ bt_clear_reg(bt, dreg);
+ }
+ } else {
+ if (BPF_SRC(insn->code) == BPF_X) {
+ /* dreg += sreg
+ * both dreg and sreg need precision
+ * before this insn
+ */
+ if (sreg != BPF_REG_FP)
+ bt_set_reg(bt, sreg);
+ } /* else dreg += K
+ * dreg still needs precision before this insn
+ */
+ }
+ } else if (class == BPF_LDX ||
+ is_atomic_load_insn(insn) ||
+ is_atomic_fetch_insn(insn)) {
+ u32 load_reg = dreg;
+
+ /*
+ * Atomic fetch operation writes the old value into
+ * a register (sreg or r0) and if it was tracked for
+ * precision, propagate to the stack slot like we do
+ * in regular ldx.
+ */
+ if (is_atomic_fetch_insn(insn))
+ load_reg = insn->imm == BPF_CMPXCHG ?
+ BPF_REG_0 : sreg;
+
+ if (!bt_is_reg_set(bt, load_reg))
+ return 0;
+ bt_clear_reg(bt, load_reg);
+
+ /* scalars can only be spilled into stack w/o losing precision.
+ * Load from any other memory can be zero extended.
+ * The desire to keep that precision is already indicated
+ * by 'precise' mark in corresponding register of this state.
+ * No further tracking necessary.
+ */
+ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
+ return 0;
+ /* dreg = *(u64 *)[fp - off] was a fill from the stack.
+ * that [fp - off] slot contains scalar that needs to be
+ * tracked with precision
+ */
+ spi = insn_stack_access_spi(hist->flags);
+ fr = insn_stack_access_frameno(hist->flags);
+ bpf_bt_set_frame_slot(bt, fr, spi);
+ } else if (class == BPF_STX || class == BPF_ST) {
+ if (bt_is_reg_set(bt, dreg))
+ /* stx & st shouldn't be using _scalar_ dst_reg
+ * to access memory. It means backtracking
+ * encountered a case of pointer subtraction.
+ */
+ return -ENOTSUPP;
+ /* scalars can only be spilled into stack */
+ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
+ return 0;
+ spi = insn_stack_access_spi(hist->flags);
+ fr = insn_stack_access_frameno(hist->flags);
+ if (!bt_is_frame_slot_set(bt, fr, spi))
+ return 0;
+ bt_clear_frame_slot(bt, fr, spi);
+ if (class == BPF_STX)
+ bt_set_reg(bt, sreg);
+ } else if (class == BPF_JMP || class == BPF_JMP32) {
+ if (bpf_pseudo_call(insn)) {
+ int subprog_insn_idx, subprog;
+
+ subprog_insn_idx = idx + insn->imm + 1;
+ subprog = bpf_find_subprog(env, subprog_insn_idx);
+ if (subprog < 0)
+ return -EFAULT;
+
+ if (bpf_subprog_is_global(env, subprog)) {
+ /* check that jump history doesn't have any
+ * extra instructions from subprog; the next
+ * instruction after call to global subprog
+ * should be literally next instruction in
+ * caller program
+ */
+ verifier_bug_if(idx + 1 != subseq_idx, env,
+ "extra insn from subprog");
+ /* r1-r5 are invalidated after subprog call,
+ * so for global func call it shouldn't be set
+ * anymore
+ */
+ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
+ verifier_bug(env, "global subprog unexpected regs %x",
+ bt_reg_mask(bt));
+ return -EFAULT;
+ }
+ /* global subprog always sets R0 */
+ bt_clear_reg(bt, BPF_REG_0);
+ return 0;
+ } else {
+ /* static subprog call instruction, which
+ * means that we are exiting current subprog,
+ * so only r1-r5 could be still requested as
+ * precise, r0 and r6-r10 or any stack slot in
+ * the current frame should be zero by now
+ */
+ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
+ verifier_bug(env, "static subprog unexpected regs %x",
+ bt_reg_mask(bt));
+ return -EFAULT;
+ }
+ /* we are now tracking register spills correctly,
+ * so any instance of leftover slots is a bug
+ */
+ if (bt_stack_mask(bt) != 0) {
+ verifier_bug(env,
+ "static subprog leftover stack slots %llx",
+ bt_stack_mask(bt));
+ return -EFAULT;
+ }
+ /* propagate r1-r5 to the caller */
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
+ if (bt_is_reg_set(bt, i)) {
+ bt_clear_reg(bt, i);
+ bpf_bt_set_frame_reg(bt, bt->frame - 1, i);
+ }
+ }
+ if (bt_subprog_exit(bt))
+ return -EFAULT;
+ return 0;
+ }
+ } else if (bpf_is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) {
+ /* exit from callback subprog to callback-calling helper or
+ * kfunc call. Use idx/subseq_idx check to discern it from
+ * straight line code backtracking.
+ * Unlike the subprog call handling above, we shouldn't
+ * propagate precision of r1-r5 (if any requested), as they are
+ * not actually arguments passed directly to callback subprogs
+ */
+ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
+ verifier_bug(env, "callback unexpected regs %x",
+ bt_reg_mask(bt));
+ return -EFAULT;
+ }
+ if (bt_stack_mask(bt) != 0) {
+ verifier_bug(env, "callback leftover stack slots %llx",
+ bt_stack_mask(bt));
+ return -EFAULT;
+ }
+ /* clear r1-r5 in callback subprog's mask */
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+ bt_clear_reg(bt, i);
+ if (bt_subprog_exit(bt))
+ return -EFAULT;
+ return 0;
+ } else if (opcode == BPF_CALL) {
+ /* kfunc with imm==0 is invalid and fixup_kfunc_call will
+ * catch this error later. Make backtracking conservative
+ * with ENOTSUPP.
+ */
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0)
+ return -ENOTSUPP;
+ /* regular helper call sets R0 */
+ bt_clear_reg(bt, BPF_REG_0);
+ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
+ /* if backtracking was looking for registers R1-R5
+ * they should have been found already.
+ */
+ verifier_bug(env, "backtracking call unexpected regs %x",
+ bt_reg_mask(bt));
+ return -EFAULT;
+ }
+ if (insn->src_reg == BPF_REG_0 && insn->imm == BPF_FUNC_tail_call
+ && subseq_idx - idx != 1) {
+ if (bt_subprog_enter(bt))
+ return -EFAULT;
+ }
+ } else if (opcode == BPF_EXIT) {
+ bool r0_precise;
+
+ /* Backtracking to a nested function call, 'idx' is a part of
+ * the inner frame 'subseq_idx' is a part of the outer frame.
+ * In case of a regular function call, instructions giving
+ * precision to registers R1-R5 should have been found already.
+ * In case of a callback, it is ok to have R1-R5 marked for
+ * backtracking, as these registers are set by the function
+ * invoking callback.
+ */
+ if (subseq_idx >= 0 && bpf_calls_callback(env, subseq_idx))
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+ bt_clear_reg(bt, i);
+ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
+ verifier_bug(env, "backtracking exit unexpected regs %x",
+ bt_reg_mask(bt));
+ return -EFAULT;
+ }
+
+ /* BPF_EXIT in subprog or callback always returns
+ * right after the call instruction, so by checking
+ * whether the instruction at subseq_idx-1 is subprog
+ * call or not we can distinguish actual exit from
+ * *subprog* from exit from *callback*. In the former
+ * case, we need to propagate r0 precision, if
+ * necessary. In the latter we never do that.
+ */
+ r0_precise = subseq_idx - 1 >= 0 &&
+ bpf_pseudo_call(&env->prog->insnsi[subseq_idx - 1]) &&
+ bt_is_reg_set(bt, BPF_REG_0);
+
+ bt_clear_reg(bt, BPF_REG_0);
+ if (bt_subprog_enter(bt))
+ return -EFAULT;
+
+ if (r0_precise)
+ bt_set_reg(bt, BPF_REG_0);
+ /* r6-r9 and stack slots will stay set in caller frame
+ * bitmasks until we return back from callee(s)
+ */
+ return 0;
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ if (!bt_is_reg_set(bt, dreg) && !bt_is_reg_set(bt, sreg))
+ return 0;
+ /* dreg <cond> sreg
+ * Both dreg and sreg need precision before
+ * this insn. If only sreg was marked precise
+ * before it would be equally necessary to
+ * propagate it to dreg.
+ */
+ if (!hist || !(hist->flags & INSN_F_SRC_REG_STACK))
+ bt_set_reg(bt, sreg);
+ if (!hist || !(hist->flags & INSN_F_DST_REG_STACK))
+ bt_set_reg(bt, dreg);
+ } else if (BPF_SRC(insn->code) == BPF_K) {
+ /* dreg <cond> K
+ * Only dreg still needs precision before
+ * this insn, so for the K-based conditional
+ * there is nothing new to be marked.
+ */
+ }
+ } else if (class == BPF_LD) {
+ if (!bt_is_reg_set(bt, dreg))
+ return 0;
+ bt_clear_reg(bt, dreg);
+ /* It's ld_imm64 or ld_abs or ld_ind.
+ * For ld_imm64 no further tracking of precision
+ * into parent is necessary
+ */
+ if (mode == BPF_IND || mode == BPF_ABS)
+ /* to be analyzed */
+ return -ENOTSUPP;
+ }
+ /* Propagate precision marks to linked registers, to account for
+ * registers marked as precise in this function.
+ */
+ bpf_bt_sync_linked_regs(bt, hist);
+ return 0;
+}
+
+/* the scalar precision tracking algorithm:
+ * . at the start all registers have precise=false.
+ * . scalar ranges are tracked as normal through alu and jmp insns.
+ * . once precise value of the scalar register is used in:
+ * . ptr + scalar alu
+ * . if (scalar cond K|scalar)
+ * . helper_call(.., scalar, ...) where ARG_CONST is expected
+ * backtrack through the verifier states and mark all registers and
+ * stack slots with spilled constants that these scalar registers
+ * should be precise.
+ * . during state pruning two registers (or spilled stack slots)
+ * are equivalent if both are not precise.
+ *
+ * Note the verifier cannot simply walk register parentage chain,
+ * since many different registers and stack slots could have been
+ * used to compute single precise scalar.
+ *
+ * The approach of starting with precise=true for all registers and then
+ * backtrack to mark a register as not precise when the verifier detects
+ * that program doesn't care about specific value (e.g., when helper
+ * takes register as ARG_ANYTHING parameter) is not safe.
+ *
+ * It's ok to walk single parentage chain of the verifier states.
+ * It's possible that this backtracking will go all the way till 1st insn.
+ * All other branches will be explored for needing precision later.
+ *
+ * The backtracking needs to deal with cases like:
+ * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0)
+ * r9 -= r8
+ * r5 = r9
+ * if r5 > 0x79f goto pc+7
+ * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff))
+ * r5 += 1
+ * ...
+ * call bpf_perf_event_output#25
+ * where .arg5_type = ARG_CONST_SIZE_OR_ZERO
+ *
+ * and this case:
+ * r6 = 1
+ * call foo // uses callee's r6 inside to compute r0
+ * r0 += r6
+ * if r0 == 0 goto
+ *
+ * to track above reg_mask/stack_mask needs to be independent for each frame.
+ *
+ * Also if parent's curframe > frame where backtracking started,
+ * the verifier needs to mark registers in both frames, otherwise callees
+ * may incorrectly prune callers. This is similar to
+ * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences")
+ *
+ * For now backtracking falls back into conservative marking.
+ */
+void bpf_mark_all_scalars_precise(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st)
+{
+ struct bpf_func_state *func;
+ struct bpf_reg_state *reg;
+ int i, j;
+
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "mark_precise: frame%d: falling back to forcing all scalars precise\n",
+ st->curframe);
+ }
+
+ /* big hammer: mark all scalars precise in this path.
+ * pop_stack may still get !precise scalars.
+ * We also skip current state and go straight to first parent state,
+ * because precision markings in current non-checkpointed state are
+ * not needed. See why in the comment in __mark_chain_precision below.
+ */
+ for (st = st->parent; st; st = st->parent) {
+ for (i = 0; i <= st->curframe; i++) {
+ func = st->frame[i];
+ for (j = 0; j < BPF_REG_FP; j++) {
+ reg = &func->regs[j];
+ if (reg->type != SCALAR_VALUE || reg->precise)
+ continue;
+ reg->precise = true;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "force_precise: frame%d: forcing r%d to be precise\n",
+ i, j);
+ }
+ }
+ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
+ if (!bpf_is_spilled_reg(&func->stack[j]))
+ continue;
+ reg = &func->stack[j].spilled_ptr;
+ if (reg->type != SCALAR_VALUE || reg->precise)
+ continue;
+ reg->precise = true;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "force_precise: frame%d: forcing fp%d to be precise\n",
+ i, -(j + 1) * 8);
+ }
+ }
+ }
+ }
+}
+
+/*
+ * bpf_mark_chain_precision() backtracks BPF program instruction sequence and
+ * chain of verifier states making sure that register *regno* (if regno >= 0)
+ * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked
+ * SCALARS, as well as any other registers and slots that contribute to
+ * a tracked state of given registers/stack slots, depending on specific BPF
+ * assembly instructions (see backtrack_insns() for exact instruction handling
+ * logic). This backtracking relies on recorded jmp_history and is able to
+ * traverse entire chain of parent states. This process ends only when all the
+ * necessary registers/slots and their transitive dependencies are marked as
+ * precise.
+ *
+ * One important and subtle aspect is that precise marks *do not matter* in
+ * the currently verified state (current state). It is important to understand
+ * why this is the case.
+ *
+ * First, note that current state is the state that is not yet "checkpointed",
+ * i.e., it is not yet put into env->explored_states, and it has no children
+ * states as well. It's ephemeral, and can end up either a) being discarded if
+ * compatible explored state is found at some point or BPF_EXIT instruction is
+ * reached or b) checkpointed and put into env->explored_states, branching out
+ * into one or more children states.
+ *
+ * In the former case, precise markings in current state are completely
+ * ignored by state comparison code (see regsafe() for details). Only
+ * checkpointed ("old") state precise markings are important, and if old
+ * state's register/slot is precise, regsafe() assumes current state's
+ * register/slot as precise and checks value ranges exactly and precisely. If
+ * states turn out to be compatible, current state's necessary precise
+ * markings and any required parent states' precise markings are enforced
+ * after the fact with propagate_precision() logic. But it's
+ * important to realize that in this case, even after marking current state
+ * registers/slots as precise, we immediately discard current state. So what
+ * actually matters is any of the precise markings propagated into current
+ * state's parent states, which are always checkpointed (due to b) case above).
+ * As such, for scenario a) it doesn't matter if current state has precise
+ * markings set or not.
+ *
+ * Now, for the scenario b), checkpointing and forking into child(ren)
+ * state(s). Note that before current state gets to checkpointing step, any
+ * processed instruction always assumes precise SCALAR register/slot
+ * knowledge: if precise value or range is useful to prune jump branch, BPF
+ * verifier takes this opportunity enthusiastically. Similarly, when
+ * register's value is used to calculate offset or memory address, exact
+ * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to
+ * what we mentioned above about state comparison ignoring precise markings
+ * during state comparison, BPF verifier ignores and also assumes precise
+ * markings *at will* during instruction verification process. But as verifier
+ * assumes precision, it also propagates any precision dependencies across
+ * parent states, which are not yet finalized, so can be further restricted
+ * based on new knowledge gained from restrictions enforced by their children
+ * states. This is so that once those parent states are finalized, i.e., when
+ * they have no more active children state, state comparison logic in
+ * is_state_visited() would enforce strict and precise SCALAR ranges, if
+ * required for correctness.
+ *
+ * To build a bit more intuition, note also that once a state is checkpointed,
+ * the path we took to get to that state is not important. This is crucial
+ * property for state pruning. When state is checkpointed and finalized at
+ * some instruction index, it can be correctly and safely used to "short
+ * circuit" any *compatible* state that reaches exactly the same instruction
+ * index. I.e., if we jumped to that instruction from a completely different
+ * code path than original finalized state was derived from, it doesn't
+ * matter, current state can be discarded because from that instruction
+ * forward having a compatible state will ensure we will safely reach the
+ * exit. States describe preconditions for further exploration, but completely
+ * forget the history of how we got here.
+ *
+ * This also means that even if we needed precise SCALAR range to get to
+ * finalized state, but from that point forward *that same* SCALAR register is
+ * never used in a precise context (i.e., its precise value is not needed for
+ * correctness), it's correct and safe to mark such register as "imprecise"
+ * (i.e., precise marking set to false). This is what we rely on when we do
+ * not set precise marking in current state. If no child state requires
+ * precision for any given SCALAR register, it's safe to dictate that it can
+ * be imprecise. If any child state does require this register to be precise,
+ * we'll mark it precise later retroactively during precise markings
+ * propagation from child state to parent states.
+ *
+ * Skipping precise marking setting in current state is a mild version of
+ * relying on the above observation. But we can utilize this property even
+ * more aggressively by proactively forgetting any precise marking in the
+ * current state (which we inherited from the parent state), right before we
+ * checkpoint it and branch off into new child state. This is done by
+ * mark_all_scalars_imprecise() to hopefully get more permissive and generic
+ * finalized states which help in short circuiting more future states.
+ */
+int bpf_mark_chain_precision(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *starting_state,
+ int regno,
+ bool *changed)
+{
+ struct bpf_verifier_state *st = starting_state;
+ struct backtrack_state *bt = &env->bt;
+ int first_idx = st->first_insn_idx;
+ int last_idx = starting_state->insn_idx;
+ int subseq_idx = -1;
+ struct bpf_func_state *func;
+ bool tmp, skip_first = true;
+ struct bpf_reg_state *reg;
+ int i, fr, err;
+
+ if (!env->bpf_capable)
+ return 0;
+
+ changed = changed ?: &tmp;
+ /* set frame number from which we are starting to backtrack */
+ bt_init(bt, starting_state->curframe);
+
+ /* Do sanity checks against current state of register and/or stack
+ * slot, but don't set precise flag in current state, as precision
+ * tracking in the current state is unnecessary.
+ */
+ func = st->frame[bt->frame];
+ if (regno >= 0) {
+ reg = &func->regs[regno];
+ if (reg->type != SCALAR_VALUE) {
+ verifier_bug(env, "backtracking misuse");
+ return -EFAULT;
+ }
+ bt_set_reg(bt, regno);
+ }
+
+ if (bt_empty(bt))
+ return 0;
+
+ for (;;) {
+ DECLARE_BITMAP(mask, 64);
+ u32 history = st->jmp_history_cnt;
+ struct bpf_jmp_history_entry *hist;
+
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
+ bt->frame, last_idx, first_idx, subseq_idx);
+ }
+
+ if (last_idx < 0) {
+ /* we are at the entry into subprog, which
+ * is expected for global funcs, but only if
+ * requested precise registers are R1-R5
+ * (which are global func's input arguments)
+ */
+ if (st->curframe == 0 &&
+ st->frame[0]->subprogno > 0 &&
+ st->frame[0]->callsite == BPF_MAIN_FUNC &&
+ bt_stack_mask(bt) == 0 &&
+ (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) == 0) {
+ bitmap_from_u64(mask, bt_reg_mask(bt));
+ for_each_set_bit(i, mask, 32) {
+ reg = &st->frame[0]->regs[i];
+ bt_clear_reg(bt, i);
+ if (reg->type == SCALAR_VALUE) {
+ reg->precise = true;
+ *changed = true;
+ }
+ }
+ return 0;
+ }
+
+ verifier_bug(env, "backtracking func entry subprog %d reg_mask %x stack_mask %llx",
+ st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt));
+ return -EFAULT;
+ }
+
+ for (i = last_idx;;) {
+ if (skip_first) {
+ err = 0;
+ skip_first = false;
+ } else {
+ hist = get_jmp_hist_entry(st, history, i);
+ err = backtrack_insn(env, i, subseq_idx, hist, bt);
+ }
+ if (err == -ENOTSUPP) {
+ bpf_mark_all_scalars_precise(env, starting_state);
+ bt_reset(bt);
+ return 0;
+ } else if (err) {
+ return err;
+ }
+ if (bt_empty(bt))
+ /* Found assignment(s) into tracked register in this state.
+ * Since this state is already marked, just return.
+ * Nothing to be tracked further in the parent state.
+ */
+ return 0;
+ subseq_idx = i;
+ i = get_prev_insn_idx(st, i, &history);
+ if (i == -ENOENT)
+ break;
+ if (i >= env->prog->len) {
+ /* This can happen if backtracking reached insn 0
+ * and there are still reg_mask or stack_mask
+ * to backtrack.
+ * It means the backtracking missed the spot where
+ * particular register was initialized with a constant.
+ */
+ verifier_bug(env, "backtracking idx %d", i);
+ return -EFAULT;
+ }
+ }
+ st = st->parent;
+ if (!st)
+ break;
+
+ for (fr = bt->frame; fr >= 0; fr--) {
+ func = st->frame[fr];
+ bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr));
+ for_each_set_bit(i, mask, 32) {
+ reg = &func->regs[i];
+ if (reg->type != SCALAR_VALUE) {
+ bt_clear_frame_reg(bt, fr, i);
+ continue;
+ }
+ if (reg->precise) {
+ bt_clear_frame_reg(bt, fr, i);
+ } else {
+ reg->precise = true;
+ *changed = true;
+ }
+ }
+
+ bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
+ for_each_set_bit(i, mask, 64) {
+ if (verifier_bug_if(i >= func->allocated_stack / BPF_REG_SIZE,
+ env, "stack slot %d, total slots %d",
+ i, func->allocated_stack / BPF_REG_SIZE))
+ return -EFAULT;
+
+ if (!bpf_is_spilled_scalar_reg(&func->stack[i])) {
+ bt_clear_frame_slot(bt, fr, i);
+ continue;
+ }
+ reg = &func->stack[i].spilled_ptr;
+ if (reg->precise) {
+ bt_clear_frame_slot(bt, fr, i);
+ } else {
+ reg->precise = true;
+ *changed = true;
+ }
+ }
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
+ bt_frame_reg_mask(bt, fr));
+ verbose(env, "mark_precise: frame%d: parent state regs=%s ",
+ fr, env->tmp_str_buf);
+ bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
+ bt_frame_stack_mask(bt, fr));
+ verbose(env, "stack=%s: ", env->tmp_str_buf);
+ print_verifier_state(env, st, fr, true);
+ }
+ }
+
+ if (bt_empty(bt))
+ return 0;
+
+ subseq_idx = first_idx;
+ last_idx = st->last_insn_idx;
+ first_idx = st->first_insn_idx;
+ }
+
+ /* if we still have requested precise regs or slots, we missed
+ * something (e.g., stack access through non-r10 register), so
+ * fallback to marking all precise
+ */
+ if (!bt_empty(bt)) {
+ bpf_mark_all_scalars_precise(env, starting_state);
+ bt_reset(bt);
+ }
+
+ return 0;
+}
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index 35e1ddca74d2..b73336c976b7 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -180,7 +180,7 @@ static long bloom_map_update_elem(struct bpf_map *map, void *key,
return -EINVAL;
}
-static int bloom_map_check_btf(const struct bpf_map *map,
+static int bloom_map_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index c2a2ead1f466..c76e9b0fabba 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -76,7 +76,7 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
return PTR_ERR(cgroup);
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
- value, map_flags, false, GFP_ATOMIC);
+ value, map_flags, false);
cgroup_put(cgroup);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -114,7 +114,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &cgroup_cache, true);
+ return bpf_local_storage_map_alloc(attr, &cgroup_cache);
}
static void cgroup_storage_map_free(struct bpf_map *map)
@@ -122,9 +122,8 @@ static void cgroup_storage_map_free(struct bpf_map *map)
bpf_local_storage_map_free(map, &cgroup_cache);
}
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
- void *, value, u64, flags, gfp_t, gfp_flags)
+BPF_CALL_4(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
+ void *, value, u64, flags)
{
struct bpf_local_storage_data *sdata;
@@ -143,7 +142,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
if (!percpu_ref_is_dying(&cgroup->self.refcnt) &&
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
- value, BPF_NOEXIST, false, gfp_flags);
+ value, BPF_NOEXIST, false);
out:
return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index e86734609f3d..0da8d923e39d 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -98,7 +98,7 @@ static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
sdata = bpf_local_storage_update(file_inode(fd_file(f)),
(struct bpf_local_storage_map *)map,
- value, map_flags, false, GFP_ATOMIC);
+ value, map_flags, false);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -122,9 +122,8 @@ static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
return inode_storage_delete(file_inode(fd_file(f)), map);
}
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
- void *, value, u64, flags, gfp_t, gfp_flags)
+BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
+ void *, value, u64, flags)
{
struct bpf_local_storage_data *sdata;
@@ -150,7 +149,7 @@ BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
sdata = bpf_local_storage_update(
inode, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST, false, gfp_flags);
+ BPF_NOEXIST, false);
return IS_ERR(sdata) ? (unsigned long)NULL :
(unsigned long)sdata->data;
}
@@ -179,7 +178,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &inode_cache, false);
+ return bpf_local_storage_map_alloc(attr, &inode_cache);
}
static void inode_storage_map_free(struct bpf_map *map)
diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c
index c0286f25ca3c..a2f84afe6f7c 100644
--- a/kernel/bpf/bpf_insn_array.c
+++ b/kernel/bpf/bpf_insn_array.c
@@ -98,7 +98,7 @@ static long insn_array_delete_elem(struct bpf_map *map, void *key)
return -EINVAL;
}
-static int insn_array_check_btf(const struct bpf_map *map,
+static int insn_array_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index b28f07d3a0db..6fc6a4b672b5 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -68,25 +68,19 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
- void *value, bool swap_uptrs, gfp_t gfp_flags)
+ void *value, bool swap_uptrs)
{
struct bpf_local_storage_elem *selem;
if (mem_charge(smap, owner, smap->elem_size))
return NULL;
- if (smap->use_kmalloc_nolock) {
- selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
- __GFP_ZERO, NUMA_NO_NODE);
- } else {
- selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
- gfp_flags | __GFP_NOWARN);
- }
+ selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
+ __GFP_ZERO, NUMA_NO_NODE);
if (selem) {
RCU_INIT_POINTER(SDATA(selem)->smap, smap);
atomic_set(&selem->state, 0);
- selem->use_kmalloc_nolock = smap->use_kmalloc_nolock;
if (value) {
/* No need to call check_and_init_map_value as memory is zero init */
@@ -102,46 +96,16 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return NULL;
}
-/* rcu tasks trace callback for use_kmalloc_nolock == false */
-static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
+static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
- /* If RCU Tasks Trace grace period implies RCU grace period, do
- * kfree(), else do kfree_rcu().
+ /*
+ * RCU Tasks Trace grace period implies RCU grace period, do
+ * kfree() directly.
*/
local_storage = container_of(rcu, struct bpf_local_storage, rcu);
- if (rcu_trace_implies_rcu_gp())
- kfree(local_storage);
- else
- kfree_rcu(local_storage, rcu);
-}
-
-/* Handle use_kmalloc_nolock == false */
-static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
- bool vanilla_rcu)
-{
- if (vanilla_rcu)
- kfree_rcu(local_storage, rcu);
- else
- call_rcu_tasks_trace(&local_storage->rcu,
- __bpf_local_storage_free_trace_rcu);
-}
-
-static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
-{
- struct bpf_local_storage *local_storage;
-
- local_storage = container_of(rcu, struct bpf_local_storage, rcu);
- kfree_nolock(local_storage);
-}
-
-static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
-{
- if (rcu_trace_implies_rcu_gp())
- bpf_local_storage_free_rcu(rcu);
- else
- call_rcu(rcu, bpf_local_storage_free_rcu);
+ kfree(local_storage);
}
static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
@@ -150,13 +114,8 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
if (!local_storage)
return;
- if (!local_storage->use_kmalloc_nolock) {
- __bpf_local_storage_free(local_storage, reuse_now);
- return;
- }
-
if (reuse_now) {
- call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
+ kfree_rcu(local_storage, rcu);
return;
}
@@ -164,29 +123,7 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
bpf_local_storage_free_trace_rcu);
}
-/* rcu tasks trace callback for use_kmalloc_nolock == false */
-static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
-{
- struct bpf_local_storage_elem *selem;
-
- selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
- if (rcu_trace_implies_rcu_gp())
- kfree(selem);
- else
- kfree_rcu(selem, rcu);
-}
-
-/* Handle use_kmalloc_nolock == false */
-static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
- bool vanilla_rcu)
-{
- if (vanilla_rcu)
- kfree_rcu(selem, rcu);
- else
- call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
-}
-
-static void bpf_selem_free_rcu(struct rcu_head *rcu)
+static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage_elem *selem;
struct bpf_local_storage_map *smap;
@@ -195,20 +132,13 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
/* The bpf_local_storage_map_free will wait for rcu_barrier */
smap = rcu_dereference_check(SDATA(selem)->smap, 1);
- if (smap) {
- migrate_disable();
+ if (smap)
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
- migrate_enable();
- }
- kfree_nolock(selem);
-}
-
-static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
-{
- if (rcu_trace_implies_rcu_gp())
- bpf_selem_free_rcu(rcu);
- else
- call_rcu(rcu, bpf_selem_free_rcu);
+ /*
+ * RCU Tasks Trace grace period implies RCU grace period, do
+ * kfree() directly.
+ */
+ kfree(selem);
}
void bpf_selem_free(struct bpf_local_storage_elem *selem,
@@ -216,26 +146,12 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
{
struct bpf_local_storage_map *smap;
- smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+ smap = rcu_dereference_check(SDATA(selem)->smap, 1);
- if (!selem->use_kmalloc_nolock) {
- /*
- * No uptr will be unpin even when reuse_now == false since uptr
- * is only supported in task local storage, where
- * smap->use_kmalloc_nolock == true.
- */
+ if (reuse_now) {
if (smap)
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
- __bpf_selem_free(selem, reuse_now);
- return;
- }
-
- if (reuse_now) {
- /*
- * While it is okay to call bpf_obj_free_fields() that unpins uptr when
- * reuse_now == true, keep it in bpf_selem_free_rcu() for simplicity.
- */
- call_rcu(&selem->rcu, bpf_selem_free_rcu);
+ kfree_rcu(selem, rcu);
return;
}
@@ -389,6 +305,9 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem)
unsigned long flags;
int err;
+ if (in_nmi())
+ return -EOPNOTSUPP;
+
if (unlikely(!selem_linked_to_storage_lockless(selem)))
/* selem has already been unlinked from sk */
return 0;
@@ -490,6 +409,14 @@ static void bpf_selem_unlink_nofail(struct bpf_local_storage_elem *selem,
}
raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
}
+ /*
+ * Highly unlikely scenario: memory leak
+ *
+ * When destroy() fails to acquire local_storage->lock and initializes
+ * selem->local_storage to NULL before any racing map_free() sees the same
+ * selem, no one will free the local storage.
+ */
+ WARN_ON_ONCE(err && !in_map_free);
if (!err || !in_map_free)
RCU_INIT_POINTER(selem->local_storage, NULL);
}
@@ -548,8 +475,7 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata,
int bpf_local_storage_alloc(void *owner,
struct bpf_local_storage_map *smap,
- struct bpf_local_storage_elem *first_selem,
- gfp_t gfp_flags)
+ struct bpf_local_storage_elem *first_selem)
{
struct bpf_local_storage *prev_storage, *storage;
struct bpf_local_storage **owner_storage_ptr;
@@ -561,12 +487,8 @@ int bpf_local_storage_alloc(void *owner,
if (err)
return err;
- if (smap->use_kmalloc_nolock)
- storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage),
- __GFP_ZERO, NUMA_NO_NODE);
- else
- storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
- gfp_flags | __GFP_NOWARN);
+ storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage),
+ __GFP_ZERO, NUMA_NO_NODE);
if (!storage) {
err = -ENOMEM;
goto uncharge;
@@ -576,7 +498,6 @@ int bpf_local_storage_alloc(void *owner,
raw_res_spin_lock_init(&storage->lock);
storage->owner = owner;
storage->mem_charge = sizeof(*storage);
- storage->use_kmalloc_nolock = smap->use_kmalloc_nolock;
refcount_set(&storage->owner_refcnt, 1);
bpf_selem_link_storage_nolock(storage, first_selem);
@@ -624,7 +545,7 @@ uncharge:
*/
struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
- void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags)
+ void *value, u64 map_flags, bool swap_uptrs)
{
struct bpf_local_storage_data *old_sdata = NULL;
struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
@@ -641,9 +562,6 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
!btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)))
return ERR_PTR(-EINVAL);
- if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST)
- return ERR_PTR(-EINVAL);
-
local_storage = rcu_dereference_check(*owner_storage(smap, owner),
bpf_rcu_lock_held());
if (!local_storage || hlist_empty(&local_storage->list)) {
@@ -652,11 +570,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (err)
return ERR_PTR(err);
- selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
+ selem = bpf_selem_alloc(smap, owner, value, swap_uptrs);
if (!selem)
return ERR_PTR(-ENOMEM);
- err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
+ err = bpf_local_storage_alloc(owner, smap, selem);
if (err) {
bpf_selem_free(selem, true);
mem_uncharge(smap, owner, smap->elem_size);
@@ -686,7 +604,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
/* A lookup has just been done before and concluded a new selem is
* needed. The chance of an unnecessary alloc is unlikely.
*/
- alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
+ alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs);
if (!alloc_selem)
return ERR_PTR(-ENOMEM);
@@ -797,7 +715,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
return 0;
}
-int bpf_local_storage_map_check_btf(const struct bpf_map *map,
+int bpf_local_storage_map_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
@@ -853,8 +771,7 @@ u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
struct bpf_map *
bpf_local_storage_map_alloc(union bpf_attr *attr,
- struct bpf_local_storage_cache *cache,
- bool use_kmalloc_nolock)
+ struct bpf_local_storage_cache *cache)
{
struct bpf_local_storage_map *smap;
unsigned int i;
@@ -886,12 +803,6 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
smap->elem_size = offsetof(struct bpf_local_storage_elem,
sdata.data[attr->value_size]);
- /* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non
- * preemptible context. Thus, enforce all storages to use
- * kmalloc_nolock() when CONFIG_PREEMPT_RT is enabled.
- */
- smap->use_kmalloc_nolock = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : use_kmalloc_nolock;
-
smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
return &smap->map;
@@ -958,10 +869,9 @@ restart:
*/
synchronize_rcu();
- if (smap->use_kmalloc_nolock) {
- rcu_barrier_tasks_trace();
- rcu_barrier();
- }
+ /* smap remains in use regardless of kmalloc_nolock, so wait unconditionally. */
+ rcu_barrier_tasks_trace();
+ rcu_barrier();
kvfree(smap->buckets);
bpf_map_area_free(smap);
}
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 0c4a0c8e6f70..c5c925f00202 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -359,8 +359,6 @@ BTF_ID(func, bpf_lsm_sb_umount)
BTF_ID(func, bpf_lsm_settime)
#ifdef CONFIG_SECURITY_NETWORK
-BTF_ID(func, bpf_lsm_inet_conn_established)
-
BTF_ID(func, bpf_lsm_socket_accept)
BTF_ID(func, bpf_lsm_socket_bind)
BTF_ID(func, bpf_lsm_socket_connect)
@@ -381,8 +379,9 @@ BTF_ID(func, bpf_lsm_syslog)
BTF_ID(func, bpf_lsm_task_alloc)
BTF_ID(func, bpf_lsm_task_prctl)
BTF_ID(func, bpf_lsm_task_setscheduler)
-BTF_ID(func, bpf_lsm_task_to_inode)
BTF_ID(func, bpf_lsm_userns_create)
+BTF_ID(func, bpf_lsm_bdev_alloc_security)
+BTF_ID(func, bpf_lsm_bdev_setintegrity)
BTF_SET_END(sleepable_lsm_hooks)
BTF_SET_START(untrusted_lsm_hooks)
@@ -395,6 +394,8 @@ BTF_ID(func, bpf_lsm_sk_alloc_security)
BTF_ID(func, bpf_lsm_sk_free_security)
#endif /* CONFIG_SECURITY_NETWORK */
BTF_ID(func, bpf_lsm_task_free)
+BTF_ID(func, bpf_lsm_bdev_alloc_security)
+BTF_ID(func, bpf_lsm_bdev_free_security)
BTF_SET_END(untrusted_lsm_hooks)
bool bpf_lsm_is_sleepable_hook(u32 btf_id)
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 05b366b821c3..521cb9d7e8c7 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -811,9 +811,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
goto reset_unlock;
}
- /* Poison pointer on error instead of return for backward compatibility */
- bpf_prog_assoc_struct_ops(prog, &st_map->map);
-
link = kzalloc_obj(*link, GFP_USER);
if (!link) {
bpf_prog_put(prog);
@@ -824,6 +821,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
&bpf_struct_ops_link_lops, prog, prog->expected_attach_type);
*plink++ = &link->link;
+ /* Poison pointer on error instead of return for backward compatibility */
+ bpf_prog_assoc_struct_ops(prog, &st_map->map);
+
ksym = kzalloc_obj(*ksym, GFP_USER);
if (!ksym) {
err = -ENOMEM;
@@ -906,6 +906,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
reset_unlock:
bpf_struct_ops_map_free_ksyms(st_map);
bpf_struct_ops_map_free_image(st_map);
+ bpf_struct_ops_map_dissoc_progs(st_map);
bpf_struct_ops_map_put_progs(st_map);
memset(uvalue, 0, map->value_size);
memset(kvalue, 0, map->value_size);
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 605506792b5b..4b342be29eac 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -118,7 +118,7 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value, map_flags,
- true, GFP_ATOMIC);
+ true);
err = PTR_ERR_OR_ZERO(sdata);
out:
@@ -165,9 +165,8 @@ out:
return err;
}
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
- task, void *, value, u64, flags, gfp_t, gfp_flags)
+BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags)
{
struct bpf_local_storage_data *sdata;
@@ -184,7 +183,7 @@ BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) {
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST, false, gfp_flags);
+ BPF_NOEXIST, false);
return IS_ERR(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
}
@@ -212,7 +211,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &task_cache, true);
+ return bpf_local_storage_map_alloc(attr, &task_cache);
}
static void task_storage_map_free(struct bpf_map *map)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 4872d2a6c42d..a62d78581207 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -270,6 +270,7 @@ struct btf {
struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab;
struct btf_struct_metas *struct_meta_tab;
struct btf_struct_ops_tab *struct_ops_tab;
+ struct btf_layout *layout;
/* split BTF support */
struct btf *base_btf;
@@ -1707,6 +1708,11 @@ static void btf_verifier_log_hdr(struct btf_verifier_env *env,
__btf_verifier_log(log, "type_len: %u\n", hdr->type_len);
__btf_verifier_log(log, "str_off: %u\n", hdr->str_off);
__btf_verifier_log(log, "str_len: %u\n", hdr->str_len);
+ if (hdr->hdr_len >= sizeof(struct btf_header) &&
+ btf_data_size >= hdr->hdr_len) {
+ __btf_verifier_log(log, "layout_off: %u\n", hdr->layout_off);
+ __btf_verifier_log(log, "layout_len: %u\n", hdr->layout_len);
+ }
__btf_verifier_log(log, "btf_total_size: %u\n", btf_data_size);
}
@@ -1787,7 +1793,16 @@ static void btf_free_id(struct btf *btf)
* of the _bh() version.
*/
spin_lock_irqsave(&btf_idr_lock, flags);
- idr_remove(&btf_idr, btf->id);
+ if (btf->id) {
+ idr_remove(&btf_idr, btf->id);
+ /*
+ * Clear the id here to make this function idempotent, since it will get
+ * called a couple of times for module BTFs: on module unload, and then
+ * the final btf_put(). btf_alloc_id() starts IDs with 1, so we can use
+ * 0 as sentinel value.
+ */
+ WRITE_ONCE(btf->id, 0);
+ }
spin_unlock_irqrestore(&btf_idr_lock, flags);
}
@@ -5517,7 +5532,8 @@ static int btf_parse_str_sec(struct btf_verifier_env *env)
start = btf->nohdr_data + hdr->str_off;
end = start + hdr->str_len;
- if (end != btf->data + btf->data_size) {
+ if (hdr->hdr_len < sizeof(struct btf_header) &&
+ end != btf->data + btf->data_size) {
btf_verifier_log(env, "String section is not at the end");
return -EINVAL;
}
@@ -5538,9 +5554,46 @@ static int btf_parse_str_sec(struct btf_verifier_env *env)
return 0;
}
+static int btf_parse_layout_sec(struct btf_verifier_env *env)
+{
+ const struct btf_header *hdr = &env->btf->hdr;
+ struct btf *btf = env->btf;
+ void *start, *end;
+
+ if (hdr->hdr_len < sizeof(struct btf_header) ||
+ hdr->layout_len == 0)
+ return 0;
+
+ /* Layout section must align to 4 bytes */
+ if (hdr->layout_off & (sizeof(u32) - 1)) {
+ btf_verifier_log(env, "Unaligned layout_off");
+ return -EINVAL;
+ }
+ start = btf->nohdr_data + hdr->layout_off;
+ end = start + hdr->layout_len;
+
+ if (hdr->layout_len < sizeof(struct btf_layout)) {
+ btf_verifier_log(env, "Layout section is too small");
+ return -EINVAL;
+ }
+ if (hdr->layout_len % sizeof(struct btf_layout) != 0) {
+ btf_verifier_log(env, "layout_len is not multiple of %zu",
+ sizeof(struct btf_layout));
+ return -EINVAL;
+ }
+ if (end > btf->data + btf->data_size) {
+ btf_verifier_log(env, "Layout section is too big");
+ return -EINVAL;
+ }
+ btf->layout = start;
+
+ return 0;
+}
+
static const size_t btf_sec_info_offset[] = {
offsetof(struct btf_header, type_off),
offsetof(struct btf_header, str_off),
+ offsetof(struct btf_header, layout_off)
};
static int btf_sec_info_cmp(const void *a, const void *b)
@@ -5556,24 +5609,28 @@ static int btf_check_sec_info(struct btf_verifier_env *env,
{
struct btf_sec_info secs[ARRAY_SIZE(btf_sec_info_offset)];
u32 total, expected_total, i;
+ u32 nr_secs = ARRAY_SIZE(btf_sec_info_offset);
const struct btf_header *hdr;
const struct btf *btf;
btf = env->btf;
hdr = &btf->hdr;
+ if (hdr->hdr_len < sizeof(struct btf_header) || hdr->layout_len == 0)
+ nr_secs--;
+
/* Populate the secs from hdr */
- for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++)
+ for (i = 0; i < nr_secs; i++)
secs[i] = *(struct btf_sec_info *)((void *)hdr +
btf_sec_info_offset[i]);
- sort(secs, ARRAY_SIZE(btf_sec_info_offset),
+ sort(secs, nr_secs,
sizeof(struct btf_sec_info), btf_sec_info_cmp, NULL);
/* Check for gaps and overlap among sections */
total = 0;
expected_total = btf_data_size - hdr->hdr_len;
- for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) {
+ for (i = 0; i < nr_secs; i++) {
if (expected_total < secs[i].off) {
btf_verifier_log(env, "Invalid section offset");
return -EINVAL;
@@ -5929,6 +5986,10 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
if (err)
goto errout;
+ err = btf_parse_layout_sec(env);
+ if (err)
+ goto errout;
+
err = btf_parse_type_sec(env);
if (err)
goto errout;
@@ -6508,13 +6569,6 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
return prog->aux->attach_btf;
}
-static bool is_void_or_int_ptr(struct btf *btf, const struct btf_type *t)
-{
- /* skip modifiers */
- t = btf_type_skip_modifiers(btf, t->type, NULL);
- return btf_type_is_void(t) || btf_type_is_int(t);
-}
-
u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
int off)
{
@@ -6903,10 +6957,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
}
/*
- * If it's a pointer to void, it's the same as scalar from the verifier
- * safety POV. Either way, no futher pointer walking is allowed.
+ * If it's a single or multilevel pointer, except a pointer
+ * to a structure, it's the same as scalar from the verifier
+ * safety POV. Multilevel pointers to structures are treated as
+ * scalars. The verifier lacks the context to infer the size of
+ * their target memory regions. Either way, no further pointer
+ * walking is allowed.
*/
- if (is_void_or_int_ptr(btf, t))
+ if (!btf_type_is_struct_ptr(btf, t))
return true;
/* this is a pointer to another type */
@@ -7836,15 +7894,16 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
tname, nargs, MAX_BPF_FUNC_REG_ARGS);
return -EINVAL;
}
- /* check that function returns int, exception cb also requires this */
+ /* check that function is void or returns int, exception cb also requires this */
t = btf_type_by_id(btf, t->type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (!btf_type_is_int(t) && !btf_is_any_enum(t)) {
+ if (!btf_type_is_void(t) && !btf_type_is_int(t) && !btf_is_any_enum(t)) {
if (!is_global)
return -EINVAL;
bpf_log(log,
- "Global function %s() doesn't return scalar. Only those are supported.\n",
+ "Global function %s() return value not void or scalar. "
+ "Only those are supported.\n",
tname);
return -EINVAL;
}
@@ -8115,7 +8174,7 @@ static void bpf_btf_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct btf *btf = filp->private_data;
- seq_printf(m, "btf_id:\t%u\n", btf->id);
+ seq_printf(m, "btf_id:\t%u\n", READ_ONCE(btf->id));
}
#endif
@@ -8197,7 +8256,7 @@ int btf_get_info_by_fd(const struct btf *btf,
if (copy_from_user(&info, uinfo, info_copy))
return -EFAULT;
- info.id = btf->id;
+ info.id = READ_ONCE(btf->id);
ubtf = u64_to_user_ptr(info.btf);
btf_copy = min_t(u32, btf->data_size, info.btf_size);
if (copy_to_user(ubtf, btf->data, btf_copy))
@@ -8260,7 +8319,7 @@ int btf_get_fd_by_id(u32 id)
u32 btf_obj_id(const struct btf *btf)
{
- return btf->id;
+ return READ_ONCE(btf->id);
}
bool btf_is_kernel(const struct btf *btf)
@@ -8382,6 +8441,13 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
if (btf_mod->module != module)
continue;
+ /*
+ * For modules, we do the freeing of BTF IDR as soon as
+ * module goes away to disable BTF discovery, since the
+ * btf_try_get_module() on such BTFs will fail. This may
+ * be called again on btf_put(), but it's ok to do so.
+ */
+ btf_free_id(btf_mod->btf);
list_del(&btf_mod->list);
if (btf_mod->sysfs_attr)
sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr);
@@ -9003,7 +9069,7 @@ static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc
if (!t || !btf_type_is_ptr(t))
return -EINVAL;
- if (IS_ENABLED(CONFIG_CFI_CLANG)) {
+ if (IS_ENABLED(CONFIG_CFI)) {
/* Ensure the destructor kfunc type matches btf_dtor_kfunc_t */
t = btf_type_by_id(btf, t->type);
if (!btf_type_is_void(t))
diff --git a/kernel/bpf/cfg.c b/kernel/bpf/cfg.c
new file mode 100644
index 000000000000..998f42a8189a
--- /dev/null
+++ b/kernel/bpf/cfg.c
@@ -0,0 +1,872 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/filter.h>
+#include <linux/sort.h>
+
+#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
+
+/* non-recursive DFS pseudo code
+ * 1 procedure DFS-iterative(G,v):
+ * 2 label v as discovered
+ * 3 let S be a stack
+ * 4 S.push(v)
+ * 5 while S is not empty
+ * 6 t <- S.peek()
+ * 7 if t is what we're looking for:
+ * 8 return t
+ * 9 for all edges e in G.adjacentEdges(t) do
+ * 10 if edge e is already labelled
+ * 11 continue with the next edge
+ * 12 w <- G.adjacentVertex(t,e)
+ * 13 if vertex w is not discovered and not explored
+ * 14 label e as tree-edge
+ * 15 label w as discovered
+ * 16 S.push(w)
+ * 17 continue at 5
+ * 18 else if vertex w is discovered
+ * 19 label e as back-edge
+ * 20 else
+ * 21 // vertex w is explored
+ * 22 label e as forward- or cross-edge
+ * 23 label t as explored
+ * 24 S.pop()
+ *
+ * convention:
+ * 0x10 - discovered
+ * 0x11 - discovered and fall-through edge labelled
+ * 0x12 - discovered and fall-through and branch edges labelled
+ * 0x20 - explored
+ */
+
+enum {
+ DISCOVERED = 0x10,
+ EXPLORED = 0x20,
+ FALLTHROUGH = 1,
+ BRANCH = 2,
+};
+
+
+static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off)
+{
+ struct bpf_subprog_info *subprog;
+
+ subprog = bpf_find_containing_subprog(env, off);
+ subprog->changes_pkt_data = true;
+}
+
+static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off)
+{
+ struct bpf_subprog_info *subprog;
+
+ subprog = bpf_find_containing_subprog(env, off);
+ subprog->might_sleep = true;
+}
+
+/* 't' is an index of a call-site.
+ * 'w' is a callee entry point.
+ * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED.
+ * Rely on DFS traversal order and absence of recursive calls to guarantee that
+ * callee's change_pkt_data marks would be correct at that moment.
+ */
+static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w)
+{
+ struct bpf_subprog_info *caller, *callee;
+
+ caller = bpf_find_containing_subprog(env, t);
+ callee = bpf_find_containing_subprog(env, w);
+ caller->changes_pkt_data |= callee->changes_pkt_data;
+ caller->might_sleep |= callee->might_sleep;
+}
+
+enum {
+ DONE_EXPLORING = 0,
+ KEEP_EXPLORING = 1,
+};
+
+/* t, w, e - match pseudo-code above:
+ * t - index of current instruction
+ * w - next instruction
+ * e - edge
+ */
+static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
+{
+ int *insn_stack = env->cfg.insn_stack;
+ int *insn_state = env->cfg.insn_state;
+
+ if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
+ return DONE_EXPLORING;
+
+ if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
+ return DONE_EXPLORING;
+
+ if (w < 0 || w >= env->prog->len) {
+ verbose_linfo(env, t, "%d: ", t);
+ verbose(env, "jump out of range from insn %d to %d\n", t, w);
+ return -EINVAL;
+ }
+
+ if (e == BRANCH) {
+ /* mark branch target for state pruning */
+ mark_prune_point(env, w);
+ mark_jmp_point(env, w);
+ }
+
+ if (insn_state[w] == 0) {
+ /* tree-edge */
+ insn_state[t] = DISCOVERED | e;
+ insn_state[w] = DISCOVERED;
+ if (env->cfg.cur_stack >= env->prog->len)
+ return -E2BIG;
+ insn_stack[env->cfg.cur_stack++] = w;
+ return KEEP_EXPLORING;
+ } else if ((insn_state[w] & 0xF0) == DISCOVERED) {
+ if (env->bpf_capable)
+ return DONE_EXPLORING;
+ verbose_linfo(env, t, "%d: ", t);
+ verbose_linfo(env, w, "%d: ", w);
+ verbose(env, "back-edge from insn %d to %d\n", t, w);
+ return -EINVAL;
+ } else if (insn_state[w] == EXPLORED) {
+ /* forward- or cross-edge */
+ insn_state[t] = DISCOVERED | e;
+ } else {
+ verifier_bug(env, "insn state internal bug");
+ return -EFAULT;
+ }
+ return DONE_EXPLORING;
+}
+
+static int visit_func_call_insn(int t, struct bpf_insn *insns,
+ struct bpf_verifier_env *env,
+ bool visit_callee)
+{
+ int ret, insn_sz;
+ int w;
+
+ insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1;
+ ret = push_insn(t, t + insn_sz, FALLTHROUGH, env);
+ if (ret)
+ return ret;
+
+ mark_prune_point(env, t + insn_sz);
+ /* when we exit from subprog, we need to record non-linear history */
+ mark_jmp_point(env, t + insn_sz);
+
+ if (visit_callee) {
+ w = t + insns[t].imm + 1;
+ mark_prune_point(env, t);
+ merge_callee_effects(env, t, w);
+ ret = push_insn(t, w, BRANCH, env);
+ }
+ return ret;
+}
+
+struct bpf_iarray *bpf_iarray_realloc(struct bpf_iarray *old, size_t n_elem)
+{
+ size_t new_size = sizeof(struct bpf_iarray) + n_elem * sizeof(old->items[0]);
+ struct bpf_iarray *new;
+
+ new = kvrealloc(old, new_size, GFP_KERNEL_ACCOUNT);
+ if (!new) {
+ /* this is what callers always want, so simplify the call site */
+ kvfree(old);
+ return NULL;
+ }
+
+ new->cnt = n_elem;
+ return new;
+}
+
+static int copy_insn_array(struct bpf_map *map, u32 start, u32 end, u32 *items)
+{
+ struct bpf_insn_array_value *value;
+ u32 i;
+
+ for (i = start; i <= end; i++) {
+ value = map->ops->map_lookup_elem(map, &i);
+ /*
+ * map_lookup_elem of an array map will never return an error,
+ * but not checking it makes some static analysers to worry
+ */
+ if (IS_ERR(value))
+ return PTR_ERR(value);
+ else if (!value)
+ return -EINVAL;
+ items[i - start] = value->xlated_off;
+ }
+ return 0;
+}
+
+static int cmp_ptr_to_u32(const void *a, const void *b)
+{
+ return *(u32 *)a - *(u32 *)b;
+}
+
+static int sort_insn_array_uniq(u32 *items, int cnt)
+{
+ int unique = 1;
+ int i;
+
+ sort(items, cnt, sizeof(items[0]), cmp_ptr_to_u32, NULL);
+
+ for (i = 1; i < cnt; i++)
+ if (items[i] != items[unique - 1])
+ items[unique++] = items[i];
+
+ return unique;
+}
+
+/*
+ * sort_unique({map[start], ..., map[end]}) into off
+ */
+int bpf_copy_insn_array_uniq(struct bpf_map *map, u32 start, u32 end, u32 *off)
+{
+ u32 n = end - start + 1;
+ int err;
+
+ err = copy_insn_array(map, start, end, off);
+ if (err)
+ return err;
+
+ return sort_insn_array_uniq(off, n);
+}
+
+/*
+ * Copy all unique offsets from the map
+ */
+static struct bpf_iarray *jt_from_map(struct bpf_map *map)
+{
+ struct bpf_iarray *jt;
+ int err;
+ int n;
+
+ jt = bpf_iarray_realloc(NULL, map->max_entries);
+ if (!jt)
+ return ERR_PTR(-ENOMEM);
+
+ n = bpf_copy_insn_array_uniq(map, 0, map->max_entries - 1, jt->items);
+ if (n < 0) {
+ err = n;
+ goto err_free;
+ }
+ if (n == 0) {
+ err = -EINVAL;
+ goto err_free;
+ }
+ jt->cnt = n;
+ return jt;
+
+err_free:
+ kvfree(jt);
+ return ERR_PTR(err);
+}
+
+/*
+ * Find and collect all maps which fit in the subprog. Return the result as one
+ * combined jump table in jt->items (allocated with kvcalloc)
+ */
+static struct bpf_iarray *jt_from_subprog(struct bpf_verifier_env *env,
+ int subprog_start, int subprog_end)
+{
+ struct bpf_iarray *jt = NULL;
+ struct bpf_map *map;
+ struct bpf_iarray *jt_cur;
+ int i;
+
+ for (i = 0; i < env->insn_array_map_cnt; i++) {
+ /*
+ * TODO (when needed): collect only jump tables, not static keys
+ * or maps for indirect calls
+ */
+ map = env->insn_array_maps[i];
+
+ jt_cur = jt_from_map(map);
+ if (IS_ERR(jt_cur)) {
+ kvfree(jt);
+ return jt_cur;
+ }
+
+ /*
+ * This is enough to check one element. The full table is
+ * checked to fit inside the subprog later in create_jt()
+ */
+ if (jt_cur->items[0] >= subprog_start && jt_cur->items[0] < subprog_end) {
+ u32 old_cnt = jt ? jt->cnt : 0;
+ jt = bpf_iarray_realloc(jt, old_cnt + jt_cur->cnt);
+ if (!jt) {
+ kvfree(jt_cur);
+ return ERR_PTR(-ENOMEM);
+ }
+ memcpy(jt->items + old_cnt, jt_cur->items, jt_cur->cnt << 2);
+ }
+
+ kvfree(jt_cur);
+ }
+
+ if (!jt) {
+ verbose(env, "no jump tables found for subprog starting at %u\n", subprog_start);
+ return ERR_PTR(-EINVAL);
+ }
+
+ jt->cnt = sort_insn_array_uniq(jt->items, jt->cnt);
+ return jt;
+}
+
+static struct bpf_iarray *
+create_jt(int t, struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprog;
+ int subprog_start, subprog_end;
+ struct bpf_iarray *jt;
+ int i;
+
+ subprog = bpf_find_containing_subprog(env, t);
+ subprog_start = subprog->start;
+ subprog_end = (subprog + 1)->start;
+ jt = jt_from_subprog(env, subprog_start, subprog_end);
+ if (IS_ERR(jt))
+ return jt;
+
+ /* Check that the every element of the jump table fits within the given subprogram */
+ for (i = 0; i < jt->cnt; i++) {
+ if (jt->items[i] < subprog_start || jt->items[i] >= subprog_end) {
+ verbose(env, "jump table for insn %d points outside of the subprog [%u,%u]\n",
+ t, subprog_start, subprog_end);
+ kvfree(jt);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ return jt;
+}
+
+/* "conditional jump with N edges" */
+static int visit_gotox_insn(int t, struct bpf_verifier_env *env)
+{
+ int *insn_stack = env->cfg.insn_stack;
+ int *insn_state = env->cfg.insn_state;
+ bool keep_exploring = false;
+ struct bpf_iarray *jt;
+ int i, w;
+
+ jt = env->insn_aux_data[t].jt;
+ if (!jt) {
+ jt = create_jt(t, env);
+ if (IS_ERR(jt))
+ return PTR_ERR(jt);
+
+ env->insn_aux_data[t].jt = jt;
+ }
+
+ mark_prune_point(env, t);
+ for (i = 0; i < jt->cnt; i++) {
+ w = jt->items[i];
+ if (w < 0 || w >= env->prog->len) {
+ verbose(env, "indirect jump out of range from insn %d to %d\n", t, w);
+ return -EINVAL;
+ }
+
+ mark_jmp_point(env, w);
+
+ /* EXPLORED || DISCOVERED */
+ if (insn_state[w])
+ continue;
+
+ if (env->cfg.cur_stack >= env->prog->len)
+ return -E2BIG;
+
+ insn_stack[env->cfg.cur_stack++] = w;
+ insn_state[w] |= DISCOVERED;
+ keep_exploring = true;
+ }
+
+ return keep_exploring ? KEEP_EXPLORING : DONE_EXPLORING;
+}
+
+/*
+ * Instructions that can abnormally return from a subprog (tail_call
+ * upon success, ld_{abs,ind} upon load failure) have a hidden exit
+ * that the verifier must account for.
+ */
+static int visit_abnormal_return_insn(struct bpf_verifier_env *env, int t)
+{
+ struct bpf_subprog_info *subprog;
+ struct bpf_iarray *jt;
+
+ if (env->insn_aux_data[t].jt)
+ return 0;
+
+ jt = bpf_iarray_realloc(NULL, 2);
+ if (!jt)
+ return -ENOMEM;
+
+ subprog = bpf_find_containing_subprog(env, t);
+ jt->items[0] = t + 1;
+ jt->items[1] = subprog->exit_idx;
+ env->insn_aux_data[t].jt = jt;
+ return 0;
+}
+
+/* Visits the instruction at index t and returns one of the following:
+ * < 0 - an error occurred
+ * DONE_EXPLORING - the instruction was fully explored
+ * KEEP_EXPLORING - there is still work to be done before it is fully explored
+ */
+static int visit_insn(int t, struct bpf_verifier_env *env)
+{
+ struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t];
+ int ret, off, insn_sz;
+
+ if (bpf_pseudo_func(insn))
+ return visit_func_call_insn(t, insns, env, true);
+
+ /* All non-branch instructions have a single fall-through edge. */
+ if (BPF_CLASS(insn->code) != BPF_JMP &&
+ BPF_CLASS(insn->code) != BPF_JMP32) {
+ if (BPF_CLASS(insn->code) == BPF_LD &&
+ (BPF_MODE(insn->code) == BPF_ABS ||
+ BPF_MODE(insn->code) == BPF_IND)) {
+ ret = visit_abnormal_return_insn(env, t);
+ if (ret)
+ return ret;
+ }
+ insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
+ return push_insn(t, t + insn_sz, FALLTHROUGH, env);
+ }
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_EXIT:
+ return DONE_EXPLORING;
+
+ case BPF_CALL:
+ if (bpf_is_async_callback_calling_insn(insn))
+ /* Mark this call insn as a prune point to trigger
+ * is_state_visited() check before call itself is
+ * processed by __check_func_call(). Otherwise new
+ * async state will be pushed for further exploration.
+ */
+ mark_prune_point(env, t);
+ /* For functions that invoke callbacks it is not known how many times
+ * callback would be called. Verifier models callback calling functions
+ * by repeatedly visiting callback bodies and returning to origin call
+ * instruction.
+ * In order to stop such iteration verifier needs to identify when a
+ * state identical some state from a previous iteration is reached.
+ * Check below forces creation of checkpoint before callback calling
+ * instruction to allow search for such identical states.
+ */
+ if (bpf_is_sync_callback_calling_insn(insn)) {
+ mark_calls_callback(env, t);
+ mark_force_checkpoint(env, t);
+ mark_prune_point(env, t);
+ mark_jmp_point(env, t);
+ }
+ if (bpf_helper_call(insn)) {
+ const struct bpf_func_proto *fp;
+
+ ret = bpf_get_helper_proto(env, insn->imm, &fp);
+ /* If called in a non-sleepable context program will be
+ * rejected anyway, so we should end up with precise
+ * sleepable marks on subprogs, except for dead code
+ * elimination.
+ */
+ if (ret == 0 && fp->might_sleep)
+ mark_subprog_might_sleep(env, t);
+ if (bpf_helper_changes_pkt_data(insn->imm))
+ mark_subprog_changes_pkt_data(env, t);
+ if (insn->imm == BPF_FUNC_tail_call) {
+ ret = visit_abnormal_return_insn(env, t);
+ if (ret)
+ return ret;
+ }
+ } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ struct bpf_kfunc_call_arg_meta meta;
+
+ ret = bpf_fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta);
+ if (ret == 0 && bpf_is_iter_next_kfunc(&meta)) {
+ mark_prune_point(env, t);
+ /* Checking and saving state checkpoints at iter_next() call
+ * is crucial for fast convergence of open-coded iterator loop
+ * logic, so we need to force it. If we don't do that,
+ * is_state_visited() might skip saving a checkpoint, causing
+ * unnecessarily long sequence of not checkpointed
+ * instructions and jumps, leading to exhaustion of jump
+ * history buffer, and potentially other undesired outcomes.
+ * It is expected that with correct open-coded iterators
+ * convergence will happen quickly, so we don't run a risk of
+ * exhausting memory.
+ */
+ mark_force_checkpoint(env, t);
+ }
+ /* Same as helpers, if called in a non-sleepable context
+ * program will be rejected anyway, so we should end up
+ * with precise sleepable marks on subprogs, except for
+ * dead code elimination.
+ */
+ if (ret == 0 && bpf_is_kfunc_sleepable(&meta))
+ mark_subprog_might_sleep(env, t);
+ if (ret == 0 && bpf_is_kfunc_pkt_changing(&meta))
+ mark_subprog_changes_pkt_data(env, t);
+ }
+ return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
+
+ case BPF_JA:
+ if (BPF_SRC(insn->code) == BPF_X)
+ return visit_gotox_insn(t, env);
+
+ if (BPF_CLASS(insn->code) == BPF_JMP)
+ off = insn->off;
+ else
+ off = insn->imm;
+
+ /* unconditional jump with single edge */
+ ret = push_insn(t, t + off + 1, FALLTHROUGH, env);
+ if (ret)
+ return ret;
+
+ mark_prune_point(env, t + off + 1);
+ mark_jmp_point(env, t + off + 1);
+
+ return ret;
+
+ default:
+ /* conditional jump with two edges */
+ mark_prune_point(env, t);
+ if (bpf_is_may_goto_insn(insn))
+ mark_force_checkpoint(env, t);
+
+ ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ if (ret)
+ return ret;
+
+ return push_insn(t, t + insn->off + 1, BRANCH, env);
+ }
+}
+
+/* non-recursive depth-first-search to detect loops in BPF program
+ * loop == back-edge in directed graph
+ */
+int bpf_check_cfg(struct bpf_verifier_env *env)
+{
+ int insn_cnt = env->prog->len;
+ int *insn_stack, *insn_state;
+ int ex_insn_beg, i, ret = 0;
+
+ insn_state = env->cfg.insn_state = kvzalloc_objs(int, insn_cnt,
+ GFP_KERNEL_ACCOUNT);
+ if (!insn_state)
+ return -ENOMEM;
+
+ insn_stack = env->cfg.insn_stack = kvzalloc_objs(int, insn_cnt,
+ GFP_KERNEL_ACCOUNT);
+ if (!insn_stack) {
+ kvfree(insn_state);
+ return -ENOMEM;
+ }
+
+ ex_insn_beg = env->exception_callback_subprog
+ ? env->subprog_info[env->exception_callback_subprog].start
+ : 0;
+
+ insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
+ insn_stack[0] = 0; /* 0 is the first instruction */
+ env->cfg.cur_stack = 1;
+
+walk_cfg:
+ while (env->cfg.cur_stack > 0) {
+ int t = insn_stack[env->cfg.cur_stack - 1];
+
+ ret = visit_insn(t, env);
+ switch (ret) {
+ case DONE_EXPLORING:
+ insn_state[t] = EXPLORED;
+ env->cfg.cur_stack--;
+ break;
+ case KEEP_EXPLORING:
+ break;
+ default:
+ if (ret > 0) {
+ verifier_bug(env, "visit_insn internal bug");
+ ret = -EFAULT;
+ }
+ goto err_free;
+ }
+ }
+
+ if (env->cfg.cur_stack < 0) {
+ verifier_bug(env, "pop stack internal bug");
+ ret = -EFAULT;
+ goto err_free;
+ }
+
+ if (ex_insn_beg && insn_state[ex_insn_beg] != EXPLORED) {
+ insn_state[ex_insn_beg] = DISCOVERED;
+ insn_stack[0] = ex_insn_beg;
+ env->cfg.cur_stack = 1;
+ goto walk_cfg;
+ }
+
+ for (i = 0; i < insn_cnt; i++) {
+ struct bpf_insn *insn = &env->prog->insnsi[i];
+
+ if (insn_state[i] != EXPLORED) {
+ verbose(env, "unreachable insn %d\n", i);
+ ret = -EINVAL;
+ goto err_free;
+ }
+ if (bpf_is_ldimm64(insn)) {
+ if (insn_state[i + 1] != 0) {
+ verbose(env, "jump into the middle of ldimm64 insn %d\n", i);
+ ret = -EINVAL;
+ goto err_free;
+ }
+ i++; /* skip second half of ldimm64 */
+ }
+ }
+ ret = 0; /* cfg looks good */
+ env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data;
+ env->prog->aux->might_sleep = env->subprog_info[0].might_sleep;
+
+err_free:
+ kvfree(insn_state);
+ kvfree(insn_stack);
+ env->cfg.insn_state = env->cfg.insn_stack = NULL;
+ return ret;
+}
+
+/*
+ * For each subprogram 'i' fill array env->cfg.insn_subprogram sub-range
+ * [env->subprog_info[i].postorder_start, env->subprog_info[i+1].postorder_start)
+ * with indices of 'i' instructions in postorder.
+ */
+int bpf_compute_postorder(struct bpf_verifier_env *env)
+{
+	u32 cur_postorder, i, top, stack_sz, s;
+	int *stack = NULL, *postorder = NULL, *state = NULL;
+	struct bpf_iarray *succ;
+
+	/* One slot per instruction is the worst case for all three arrays. */
+	postorder = kvzalloc_objs(int, env->prog->len, GFP_KERNEL_ACCOUNT);
+	state = kvzalloc_objs(int, env->prog->len, GFP_KERNEL_ACCOUNT);
+	stack = kvzalloc_objs(int, env->prog->len, GFP_KERNEL_ACCOUNT);
+	if (!postorder || !state || !stack) {
+		kvfree(postorder);
+		kvfree(state);
+		kvfree(stack);
+		return -ENOMEM;
+	}
+	cur_postorder = 0;
+	for (i = 0; i < env->subprog_cnt; i++) {
+		env->subprog_info[i].postorder_start = cur_postorder;
+		stack[0] = env->subprog_info[i].start;
+		stack_sz = 1;
+		/* Iterative DFS; an insn is emitted on its second visit,
+		 * i.e. once all of its successors have been processed.
+		 */
+		do {
+			top = stack[stack_sz - 1];
+			state[top] |= DISCOVERED;
+			if (state[top] & EXPLORED) {
+				/* second visit: successors done, emit */
+				postorder[cur_postorder++] = top;
+				stack_sz--;
+				continue;
+			}
+			succ = bpf_insn_successors(env, top);
+			for (s = 0; s < succ->cnt; ++s) {
+				/* push each successor at most once */
+				if (!state[succ->items[s]]) {
+					stack[stack_sz++] = succ->items[s];
+					state[succ->items[s]] |= DISCOVERED;
+				}
+			}
+			state[top] |= EXPLORED;
+		} while (stack_sz);
+	}
+	/* sentinel entry: upper bound of the last subprogram's range */
+	env->subprog_info[i].postorder_start = cur_postorder;
+	env->cfg.insn_postorder = postorder;
+	env->cfg.cur_postorder = cur_postorder;
+	kvfree(stack);
+	kvfree(state);
+	return 0;
+}
+
+/*
+ * Compute strongly connected components (SCCs) on the CFG.
+ * Assign an SCC number to each instruction, recorded in env->insn_aux[*].scc.
+ * If instruction is a sole member of its SCC and there are no self edges,
+ * assign it SCC number of zero.
+ * Uses a non-recursive adaptation of Tarjan's algorithm for SCC computation.
+ */
+int bpf_compute_scc(struct bpf_verifier_env *env)
+{
+	const u32 NOT_ON_STACK = U32_MAX;
+
+	struct bpf_insn_aux_data *aux = env->insn_aux_data;
+	const u32 insn_cnt = env->prog->len;
+	int stack_sz, dfs_sz, err = 0;
+	u32 *stack, *pre, *low, *dfs;
+	u32 i, j, t, w;
+	u32 next_preorder_num;
+	u32 next_scc_id;
+	bool assign_scc;
+	struct bpf_iarray *succ;
+
+	next_preorder_num = 1;
+	next_scc_id = 1;
+	/*
+	 * - 'stack' accumulates vertices in DFS order, see invariant comment below;
+	 * - 'pre[t] == p' => preorder number of vertex 't' is 'p';
+	 * - 'low[t] == n' => smallest preorder number of the vertex reachable from 't' is 'n';
+	 * - 'dfs' DFS traversal stack, used to emulate explicit recursion.
+	 *
+	 * Zero-initialized 'pre[]' doubles as the "not yet visited" marker,
+	 * since valid preorder numbers start at 1.
+	 */
+	stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
+	pre = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
+	low = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
+	dfs = kvcalloc(insn_cnt, sizeof(*dfs), GFP_KERNEL_ACCOUNT);
+	if (!stack || !pre || !low || !dfs) {
+		err = -ENOMEM;
+		goto exit;
+	}
+	/*
+	 * References:
+	 * [1] R. Tarjan "Depth-First Search and Linear Graph Algorithms"
+	 * [2] D. J. Pearce "A Space-Efficient Algorithm for Finding Strongly Connected Components"
+	 *
+	 * The algorithm maintains the following invariant:
+	 * - suppose there is a path 'u' ~> 'v', such that 'pre[v] < pre[u]';
+	 * - then, vertex 'u' remains on stack while vertex 'v' is on stack.
+	 *
+	 * Consequently:
+	 * - If 'low[v] < pre[v]', there is a path from 'v' to some vertex 'u',
+	 *   such that 'pre[u] == low[v]'; vertex 'u' is currently on the stack,
+	 *   and thus there is an SCC (loop) containing both 'u' and 'v'.
+	 * - If 'low[v] == pre[v]', loops containing 'v' have been explored,
+	 *   and 'v' can be considered the root of some SCC.
+	 *
+	 * Here is a pseudo-code for an explicitly recursive version of the algorithm:
+	 *
+	 *    NOT_ON_STACK = insn_cnt + 1
+	 *    pre = [0] * insn_cnt
+	 *    low = [0] * insn_cnt
+	 *    scc = [0] * insn_cnt
+	 *    stack = []
+	 *
+	 *    next_preorder_num = 1
+	 *    next_scc_id = 1
+	 *
+	 *    def recur(w):
+	 *        nonlocal next_preorder_num
+	 *        nonlocal next_scc_id
+	 *
+	 *        pre[w] = next_preorder_num
+	 *        low[w] = next_preorder_num
+	 *        next_preorder_num += 1
+	 *        stack.append(w)
+	 *        for s in successors(w):
+	 *            # Note: for classic algorithm the block below should look as:
+	 *            #
+	 *            # if pre[s] == 0:
+	 *            #     recur(s)
+	 *            #     low[w] = min(low[w], low[s])
+	 *            # elif low[s] != NOT_ON_STACK:
+	 *            #     low[w] = min(low[w], pre[s])
+	 *            #
+	 *            # But replacing both 'min' instructions with 'low[w] = min(low[w], low[s])'
+	 *            # does not break the invariant and makes iterative version of the algorithm
+	 *            # simpler. See 'Algorithm #3' from [2].
+	 *
+	 *            # 's' not yet visited
+	 *            if pre[s] == 0:
+	 *                recur(s)
+	 *            # if 's' is on stack, pick lowest reachable preorder number from it;
+	 *            # if 's' is not on stack 'low[s] == NOT_ON_STACK > low[w]',
+	 *            # so 'min' would be a noop.
+	 *            low[w] = min(low[w], low[s])
+	 *
+	 *        if low[w] == pre[w]:
+	 *            # 'w' is the root of an SCC, pop all vertices
+	 *            # below 'w' on stack and assign same SCC to them.
+	 *            while True:
+	 *                t = stack.pop()
+	 *                low[t] = NOT_ON_STACK
+	 *                scc[t] = next_scc_id
+	 *                if t == w:
+	 *                    break
+	 *            next_scc_id += 1
+	 *
+	 *    for i in range(0, insn_cnt):
+	 *        if pre[i] == 0:
+	 *            recur(i)
+	 *
+	 * Below implementation replaces explicit recursion with array 'dfs'.
+	 */
+	for (i = 0; i < insn_cnt; i++) {
+		if (pre[i])
+			continue;
+		stack_sz = 0;
+		dfs_sz = 1;
+		dfs[0] = i;
+dfs_continue:
+		while (dfs_sz) {
+			w = dfs[dfs_sz - 1];
+			if (pre[w] == 0) {
+				/* first time 'w' is seen: assign preorder number */
+				low[w] = next_preorder_num;
+				pre[w] = next_preorder_num;
+				next_preorder_num++;
+				stack[stack_sz++] = w;
+			}
+			/* Visit 'w' successors */
+			succ = bpf_insn_successors(env, w);
+			for (j = 0; j < succ->cnt; ++j) {
+				if (pre[succ->items[j]]) {
+					low[w] = min(low[w], low[succ->items[j]]);
+				} else {
+					/* "recurse" into unvisited successor */
+					dfs[dfs_sz++] = succ->items[j];
+					goto dfs_continue;
+				}
+			}
+			/*
+			 * Preserve the invariant: if some vertex above in the stack
+			 * is reachable from 'w', keep 'w' on the stack.
+			 */
+			if (low[w] < pre[w]) {
+				dfs_sz--;
+				goto dfs_continue;
+			}
+			/*
+			 * Assign SCC number only if component has two or more elements,
+			 * or if component has a self reference, or if instruction is a
+			 * callback calling function (implicit loop).
+			 */
+			assign_scc = stack[stack_sz - 1] != w; /* two or more elements? */
+			for (j = 0; j < succ->cnt; ++j) { /* self reference? */
+				if (succ->items[j] == w) {
+					assign_scc = true;
+					break;
+				}
+			}
+			if (bpf_calls_callback(env, w)) /* implicit loop? */
+				assign_scc = true;
+			/* Pop component elements from stack */
+			do {
+				t = stack[--stack_sz];
+				low[t] = NOT_ON_STACK;
+				if (assign_scc)
+					aux[t].scc = next_scc_id;
+			} while (t != w);
+			if (assign_scc)
+				next_scc_id++;
+			dfs_sz--;
+		}
+	}
+	env->scc_info = kvzalloc_objs(*env->scc_info, next_scc_id,
+				      GFP_KERNEL_ACCOUNT);
+	if (!env->scc_info) {
+		err = -ENOMEM;
+		goto exit;
+	}
+	env->scc_cnt = next_scc_id;
+exit:
+	kvfree(stack);
+	kvfree(pre);
+	kvfree(low);
+	kvfree(dfs);
+	return err;
+}
diff --git a/kernel/bpf/check_btf.c b/kernel/bpf/check_btf.c
new file mode 100644
index 000000000000..93bebe6fe12e
--- /dev/null
+++ b/kernel/bpf/check_btf.c
@@ -0,0 +1,463 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/filter.h>
+#include <linux/btf.h>
+
+#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
+
+/*
+ * Without BTF func info the return types of subprograms are unknown,
+ * so constructs that require an 'int'-returning subprogram (LD_ABS,
+ * tail calls; see check_btf_func()) cannot be validated and must be
+ * rejected in any subprog other than the main one.
+ * Returns 0 if OK, -EINVAL otherwise.
+ */
+static int check_abnormal_return(struct bpf_verifier_env *env)
+{
+	int i;
+
+	/* subprog 0 is the main program; only real subprogs are restricted */
+	for (i = 1; i < env->subprog_cnt; i++) {
+		if (env->subprog_info[i].has_ld_abs) {
+			verbose(env, "LD_ABS is not allowed in subprogs without BTF\n");
+			return -EINVAL;
+		}
+		if (env->subprog_info[i].has_tail_call) {
+			verbose(env, "tail_call is not allowed in subprogs without BTF\n");
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+/* The minimum supported BTF func info size */
+#define MIN_BPF_FUNCINFO_SIZE	8
+#define MAX_FUNCINFO_REC_SIZE	252
+
+/*
+ * Early pass over attr->func_info (runs before subprog boundaries are
+ * known): validate the record size, the strictly-increasing insn_off
+ * ordering and the BTF type ids, and stash a kernel copy of the records
+ * in prog->aux->func_info for check_btf_func() to consume later.
+ */
+static int check_btf_func_early(struct bpf_verifier_env *env,
+				const union bpf_attr *attr,
+				bpfptr_t uattr)
+{
+	u32 krec_size = sizeof(struct bpf_func_info);
+	const struct btf_type *type, *func_proto;
+	u32 i, nfuncs, urec_size, min_size;
+	struct bpf_func_info *krecord;
+	struct bpf_prog *prog;
+	const struct btf *btf;
+	u32 prev_offset = 0;
+	bpfptr_t urecord;
+	int ret = -ENOMEM;
+
+	nfuncs = attr->func_info_cnt;
+	if (!nfuncs) {
+		if (check_abnormal_return(env))
+			return -EINVAL;
+		return 0;
+	}
+
+	/* user record may be larger than ours (future extension), but must
+	 * be at least MIN_BPF_FUNCINFO_SIZE and u32-aligned
+	 */
+	urec_size = attr->func_info_rec_size;
+	if (urec_size < MIN_BPF_FUNCINFO_SIZE ||
+	    urec_size > MAX_FUNCINFO_REC_SIZE ||
+	    urec_size % sizeof(u32)) {
+		verbose(env, "invalid func info rec size %u\n", urec_size);
+		return -EINVAL;
+	}
+
+	prog = env->prog;
+	btf = prog->aux->btf;
+
+	urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
+	min_size = min_t(u32, krec_size, urec_size);
+
+	krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+	if (!krecord)
+		return -ENOMEM;
+
+	for (i = 0; i < nfuncs; i++) {
+		/* any bytes beyond what the kernel knows must be zero */
+		ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
+		if (ret) {
+			if (ret == -E2BIG) {
+				verbose(env, "nonzero tailing record in func info");
+				/* set the size kernel expects so loader can zero
+				 * out the rest of the record.
+				 */
+				if (copy_to_bpfptr_offset(uattr,
+							  offsetof(union bpf_attr, func_info_rec_size),
+							  &min_size, sizeof(min_size)))
+					ret = -EFAULT;
+			}
+			goto err_free;
+		}
+
+		if (copy_from_bpfptr(&krecord[i], urecord, min_size)) {
+			ret = -EFAULT;
+			goto err_free;
+		}
+
+		/* check insn_off */
+		ret = -EINVAL;
+		if (i == 0) {
+			/* first record must describe insn 0 (main prog entry) */
+			if (krecord[i].insn_off) {
+				verbose(env,
+					"nonzero insn_off %u for the first func info record",
+					krecord[i].insn_off);
+				goto err_free;
+			}
+		} else if (krecord[i].insn_off <= prev_offset) {
+			verbose(env,
+				"same or smaller insn offset (%u) than previous func info record (%u)",
+				krecord[i].insn_off, prev_offset);
+			goto err_free;
+		}
+
+		/* check type_id */
+		type = btf_type_by_id(btf, krecord[i].type_id);
+		if (!type || !btf_type_is_func(type)) {
+			verbose(env, "invalid type id %d in func info",
+				krecord[i].type_id);
+			goto err_free;
+		}
+
+		func_proto = btf_type_by_id(btf, type->type);
+		if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto)))
+			/* btf_func_check() already verified it during BTF load */
+			goto err_free;
+
+		prev_offset = krecord[i].insn_off;
+		bpfptr_add(&urecord, urec_size);
+	}
+
+	prog->aux->func_info = krecord;
+	prog->aux->func_info_cnt = nfuncs;
+	return 0;
+
+err_free:
+	kvfree(krecord);
+	return ret;
+}
+
+/*
+ * Second func_info pass, runs once subprog boundaries are known:
+ * each record must start exactly at a subprog boundary. Records subprog
+ * linkage and name, and enforces that subprogs using LD_ABS/tail_call
+ * have a scalar ('int'-like) BTF return type.
+ * Allocates prog->aux->func_info_aux; records themselves were already
+ * copied and validated by check_btf_func_early().
+ */
+static int check_btf_func(struct bpf_verifier_env *env,
+			  const union bpf_attr *attr,
+			  bpfptr_t uattr)
+{
+	const struct btf_type *type, *func_proto, *ret_type;
+	u32 i, nfuncs, urec_size;
+	struct bpf_func_info *krecord;
+	struct bpf_func_info_aux *info_aux = NULL;
+	struct bpf_prog *prog;
+	const struct btf *btf;
+	bpfptr_t urecord;
+	bool scalar_return;
+	int ret = -ENOMEM;
+
+	nfuncs = attr->func_info_cnt;
+	if (!nfuncs) {
+		if (check_abnormal_return(env))
+			return -EINVAL;
+		return 0;
+	}
+	/* one record per subprog, no more, no less */
+	if (nfuncs != env->subprog_cnt) {
+		verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
+		return -EINVAL;
+	}
+
+	urec_size = attr->func_info_rec_size;
+
+	prog = env->prog;
+	btf = prog->aux->btf;
+
+	urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
+
+	krecord = prog->aux->func_info;
+	info_aux = kzalloc_objs(*info_aux, nfuncs,
+				GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+	if (!info_aux)
+		return -ENOMEM;
+
+	for (i = 0; i < nfuncs; i++) {
+		/* check insn_off */
+		ret = -EINVAL;
+
+		if (env->subprog_info[i].start != krecord[i].insn_off) {
+			verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
+			goto err_free;
+		}
+
+		/* Already checked type_id */
+		type = btf_type_by_id(btf, krecord[i].type_id);
+		info_aux[i].linkage = BTF_INFO_VLEN(type->info);
+		/* Already checked func_proto */
+		func_proto = btf_type_by_id(btf, type->type);
+
+		ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL);
+		scalar_return =
+			btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type);
+		/* 'i != 0' skips the main prog, which may return anything */
+		if (i && !scalar_return && env->subprog_info[i].has_ld_abs) {
+			verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n");
+			goto err_free;
+		}
+		if (i && !scalar_return && env->subprog_info[i].has_tail_call) {
+			verbose(env, "tail_call is only allowed in functions that return 'int'.\n");
+			goto err_free;
+		}
+
+		env->subprog_info[i].name = btf_name_by_offset(btf, type->name_off);
+		bpfptr_add(&urecord, urec_size);
+	}
+
+	prog->aux->func_info_aux = info_aux;
+	return 0;
+
+err_free:
+	kfree(info_aux);
+	return ret;
+}
+
+#define MIN_BPF_LINEINFO_SIZE	offsetofend(struct bpf_line_info, line_col)
+#define MAX_LINEINFO_REC_SIZE	MAX_FUNCINFO_REC_SIZE
+
+/*
+ * Validate attr->line_info: record size, strictly increasing insn_off
+ * bounded by prog->len, valid BTF string offsets, and that every subprog
+ * start has a line info record. On success stores the kernel copy in
+ * prog->aux->linfo and records each subprog's first record index in
+ * subprog_info[].linfo_idx.
+ */
+static int check_btf_line(struct bpf_verifier_env *env,
+			  const union bpf_attr *attr,
+			  bpfptr_t uattr)
+{
+	u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0;
+	struct bpf_subprog_info *sub;
+	struct bpf_line_info *linfo;
+	struct bpf_prog *prog;
+	const struct btf *btf;
+	bpfptr_t ulinfo;
+	int err;
+
+	nr_linfo = attr->line_info_cnt;
+	if (!nr_linfo)
+		return 0;
+	/* guard the allocation size below against overflow */
+	if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info))
+		return -EINVAL;
+
+	rec_size = attr->line_info_rec_size;
+	if (rec_size < MIN_BPF_LINEINFO_SIZE ||
+	    rec_size > MAX_LINEINFO_REC_SIZE ||
+	    rec_size & (sizeof(u32) - 1))
+		return -EINVAL;
+
+	/* Need to zero it in case the userspace may
+	 * pass in a smaller bpf_line_info object.
+	 */
+	linfo = kvzalloc_objs(struct bpf_line_info, nr_linfo,
+			      GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+	if (!linfo)
+		return -ENOMEM;
+
+	prog = env->prog;
+	btf = prog->aux->btf;
+
+	s = 0;
+	sub = env->subprog_info;
+	ulinfo = make_bpfptr(attr->line_info, uattr.is_kernel);
+	expected_size = sizeof(struct bpf_line_info);
+	ncopy = min_t(u32, expected_size, rec_size);
+	for (i = 0; i < nr_linfo; i++) {
+		/* any bytes beyond what the kernel knows must be zero */
+		err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size);
+		if (err) {
+			if (err == -E2BIG) {
+				verbose(env, "nonzero tailing record in line_info");
+				if (copy_to_bpfptr_offset(uattr,
+							  offsetof(union bpf_attr, line_info_rec_size),
+							  &expected_size, sizeof(expected_size)))
+					err = -EFAULT;
+			}
+			goto err_free;
+		}
+
+		if (copy_from_bpfptr(&linfo[i], ulinfo, ncopy)) {
+			err = -EFAULT;
+			goto err_free;
+		}
+
+		/*
+		 * Check insn_off to ensure
+		 * 1) strictly increasing AND
+		 * 2) bounded by prog->len
+		 *
+		 * The linfo[0].insn_off == 0 check logically falls into
+		 * the later "missing bpf_line_info for func..." case
+		 * because the first linfo[0].insn_off must be the
+		 * first sub also and the first sub must have
+		 * subprog_info[0].start == 0.
+		 */
+		if ((i && linfo[i].insn_off <= prev_offset) ||
+		    linfo[i].insn_off >= prog->len) {
+			verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n",
+				i, linfo[i].insn_off, prev_offset,
+				prog->len);
+			err = -EINVAL;
+			goto err_free;
+		}
+
+		/* insn_off must point at a real insn, not ldimm64's 2nd half */
+		if (!prog->insnsi[linfo[i].insn_off].code) {
+			verbose(env,
+				"Invalid insn code at line_info[%u].insn_off\n",
+				i);
+			err = -EINVAL;
+			goto err_free;
+		}
+
+		if (!btf_name_by_offset(btf, linfo[i].line_off) ||
+		    !btf_name_by_offset(btf, linfo[i].file_name_off)) {
+			verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i);
+			err = -EINVAL;
+			goto err_free;
+		}
+
+		/* match records against subprog starts in lockstep */
+		if (s != env->subprog_cnt) {
+			if (linfo[i].insn_off == sub[s].start) {
+				sub[s].linfo_idx = i;
+				s++;
+			} else if (sub[s].start < linfo[i].insn_off) {
+				verbose(env, "missing bpf_line_info for func#%u\n", s);
+				err = -EINVAL;
+				goto err_free;
+			}
+		}
+
+		prev_offset = linfo[i].insn_off;
+		bpfptr_add(&ulinfo, rec_size);
+	}
+
+	if (s != env->subprog_cnt) {
+		verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n",
+			env->subprog_cnt - s, s);
+		err = -EINVAL;
+		goto err_free;
+	}
+
+	prog->aux->linfo = linfo;
+	prog->aux->nr_linfo = nr_linfo;
+
+	return 0;
+
+err_free:
+	kvfree(linfo);
+	return err;
+}
+
+#define MIN_CORE_RELO_SIZE	sizeof(struct bpf_core_relo)
+#define MAX_CORE_RELO_SIZE	MAX_FUNCINFO_REC_SIZE
+
+/*
+ * Validate and apply CO-RE relocations from attr->core_relos.
+ * Each record is copied from user memory and applied immediately via
+ * bpf_core_apply(), patching the insn at core_relo.insn_off in place.
+ */
+static int check_core_relo(struct bpf_verifier_env *env,
+			   const union bpf_attr *attr,
+			   bpfptr_t uattr)
+{
+	u32 i, nr_core_relo, ncopy, expected_size, rec_size;
+	struct bpf_core_relo core_relo = {};
+	struct bpf_prog *prog = env->prog;
+	const struct btf *btf = prog->aux->btf;
+	struct bpf_core_ctx ctx = {
+		.log = &env->log,
+		.btf = btf,
+	};
+	bpfptr_t u_core_relo;
+	int err;
+
+	nr_core_relo = attr->core_relo_cnt;
+	if (!nr_core_relo)
+		return 0;
+	if (nr_core_relo > INT_MAX / sizeof(struct bpf_core_relo))
+		return -EINVAL;
+
+	rec_size = attr->core_relo_rec_size;
+	if (rec_size < MIN_CORE_RELO_SIZE ||
+	    rec_size > MAX_CORE_RELO_SIZE ||
+	    rec_size % sizeof(u32))
+		return -EINVAL;
+
+	u_core_relo = make_bpfptr(attr->core_relos, uattr.is_kernel);
+	expected_size = sizeof(struct bpf_core_relo);
+	ncopy = min_t(u32, expected_size, rec_size);
+
+	/* Unlike func_info and line_info, copy and apply each CO-RE
+	 * relocation record one at a time.
+	 */
+	for (i = 0; i < nr_core_relo; i++) {
+		/* future proofing when sizeof(bpf_core_relo) changes */
+		err = bpf_check_uarg_tail_zero(u_core_relo, expected_size, rec_size);
+		if (err) {
+			if (err == -E2BIG) {
+				verbose(env, "nonzero tailing record in core_relo");
+				if (copy_to_bpfptr_offset(uattr,
+							  offsetof(union bpf_attr, core_relo_rec_size),
+							  &expected_size, sizeof(expected_size)))
+					err = -EFAULT;
+			}
+			break;
+		}
+
+		if (copy_from_bpfptr(&core_relo, u_core_relo, ncopy)) {
+			err = -EFAULT;
+			break;
+		}
+
+		/* insn_off is a byte offset and must address a whole insn */
+		if (core_relo.insn_off % 8 || core_relo.insn_off / 8 >= prog->len) {
+			verbose(env, "Invalid core_relo[%u].insn_off:%u prog->len:%u\n",
+				i, core_relo.insn_off, prog->len);
+			err = -EINVAL;
+			break;
+		}
+
+		err = bpf_core_apply(&ctx, &core_relo, i,
+				     &prog->insnsi[core_relo.insn_off / 8]);
+		if (err)
+			break;
+		bpfptr_add(&u_core_relo, rec_size);
+	}
+	return err;
+}
+
+/*
+ * Early BTF info entry point: take a reference on the program's BTF
+ * object (kernel BTF is not allowed as prog BTF) and run the early
+ * func_info pass. The reference stored in prog->aux->btf is released
+ * on prog free, not here.
+ */
+int bpf_check_btf_info_early(struct bpf_verifier_env *env,
+			     const union bpf_attr *attr,
+			     bpfptr_t uattr)
+{
+	struct btf *btf;
+	int err;
+
+	if (!attr->func_info_cnt && !attr->line_info_cnt) {
+		if (check_abnormal_return(env))
+			return -EINVAL;
+		return 0;
+	}
+
+	btf = btf_get_by_fd(attr->prog_btf_fd);
+	if (IS_ERR(btf))
+		return PTR_ERR(btf);
+	if (btf_is_kernel(btf)) {
+		btf_put(btf);
+		return -EACCES;
+	}
+	env->prog->aux->btf = btf;
+
+	err = check_btf_func_early(env, attr, uattr);
+	if (err)
+		return err;
+	return 0;
+}
+
+/*
+ * Late BTF info entry point, runs after subprog boundaries are known:
+ * validates func_info against the subprog layout, then line_info, then
+ * applies CO-RE relocations. Assumes bpf_check_btf_info_early() already
+ * ran and set up prog->aux->btf / prog->aux->func_info.
+ */
+int bpf_check_btf_info(struct bpf_verifier_env *env,
+		       const union bpf_attr *attr,
+		       bpfptr_t uattr)
+{
+	int err;
+
+	if (!attr->func_info_cnt && !attr->line_info_cnt) {
+		if (check_abnormal_return(env))
+			return -EINVAL;
+		return 0;
+	}
+
+	err = check_btf_func(env, attr, uattr);
+	if (err)
+		return err;
+
+	err = check_btf_line(env, attr, uattr);
+	if (err)
+		return err;
+
+	err = check_core_relo(env, attr, uattr);
+	if (err)
+		return err;
+
+	return 0;
+}
diff --git a/kernel/bpf/const_fold.c b/kernel/bpf/const_fold.c
new file mode 100644
index 000000000000..db73c4740b1e
--- /dev/null
+++ b/kernel/bpf/const_fold.c
@@ -0,0 +1,396 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bpf_verifier.h>
+
+/*
+ * Forward dataflow analysis to determine constant register values at every
+ * instruction. Tracks 64-bit constant values in R0-R9 through the program,
+ * using a fixed-point iteration in reverse postorder. Records which registers
+ * hold known constants and their values in
+ * env->insn_aux_data[].{const_reg_mask, const_reg_vals}.
+ */
+
+/* Lattice of per-register facts tracked by the constant propagation pass. */
+enum const_arg_state {
+	CONST_ARG_UNVISITED,	/* instruction not yet reached */
+	CONST_ARG_UNKNOWN,	/* register value not a known constant */
+	CONST_ARG_CONST,	/* register holds a known 64-bit constant */
+	CONST_ARG_MAP_PTR,	/* register holds a map pointer, map_index is set */
+	CONST_ARG_MAP_VALUE,	/* register points to map value data, val is offset */
+	CONST_ARG_SUBPROG,	/* register holds a subprog pointer, val is subprog number */
+};
+
+struct const_arg_info {
+	enum const_arg_state state;	/* which lattice element applies */
+	u32 map_index;			/* env->used_maps[] index, for MAP_PTR/MAP_VALUE */
+	u64 val;			/* constant / map value offset / subprog number */
+};
+
+/* Small state predicates, used to keep the transfer/join code readable. */
+static bool ci_is_unvisited(const struct const_arg_info *ci)
+{
+	return ci->state == CONST_ARG_UNVISITED;
+}
+
+static bool ci_is_unknown(const struct const_arg_info *ci)
+{
+	return ci->state == CONST_ARG_UNKNOWN;
+}
+
+static bool ci_is_const(const struct const_arg_info *ci)
+{
+	return ci->state == CONST_ARG_CONST;
+}
+
+static bool ci_is_map_value(const struct const_arg_info *ci)
+{
+	return ci->state == CONST_ARG_MAP_VALUE;
+}
+
+/*
+ * Transfer function: compute output register state from instruction.
+ * 'ci_out' holds the register state before 'insn' on entry and is
+ * updated in place to the state after 'insn'. Any instruction form not
+ * handled explicitly degrades the affected register(s) to UNKNOWN.
+ */
+static void const_reg_xfer(struct bpf_verifier_env *env, struct const_arg_info *ci_out,
+			   struct bpf_insn *insn, struct bpf_insn *insns, int idx)
+{
+	struct const_arg_info unknown = { .state = CONST_ARG_UNKNOWN, .val = 0 };
+	struct const_arg_info *dst = &ci_out[insn->dst_reg];
+	struct const_arg_info *src = &ci_out[insn->src_reg];
+	u8 class = BPF_CLASS(insn->code);
+	u8 mode = BPF_MODE(insn->code);
+	u8 opcode = BPF_OP(insn->code) | BPF_SRC(insn->code);
+	int r;
+
+	switch (class) {
+	case BPF_ALU:
+	case BPF_ALU64:
+		switch (opcode) {
+		case BPF_MOV | BPF_K:
+			/* imm is sign-extended to 64 bit */
+			dst->state = CONST_ARG_CONST;
+			dst->val = (s64)insn->imm;
+			break;
+		case BPF_MOV | BPF_X:
+			*dst = *src;
+			if (!insn->off)
+				break;
+			/* non-zero off is movsx: sign-extend from off bits */
+			if (!ci_is_const(dst)) {
+				*dst = unknown;
+				break;
+			}
+			switch (insn->off) {
+			case 8: dst->val = (s8)dst->val; break;
+			case 16: dst->val = (s16)dst->val; break;
+			case 32: dst->val = (s32)dst->val; break;
+			default: *dst = unknown; break;
+			}
+			break;
+		case BPF_ADD | BPF_K:
+			/* ADD/SUB of an imm also keeps map-value offsets exact */
+			if (!ci_is_const(dst) && !ci_is_map_value(dst)) {
+				*dst = unknown;
+				break;
+			}
+			dst->val += insn->imm;
+			break;
+		case BPF_SUB | BPF_K:
+			if (!ci_is_const(dst) && !ci_is_map_value(dst)) {
+				*dst = unknown;
+				break;
+			}
+			dst->val -= insn->imm;
+			break;
+		case BPF_AND | BPF_K:
+			if (!ci_is_const(dst)) {
+				/* x & 0 == 0 even for unknown x */
+				if (!insn->imm) {
+					dst->state = CONST_ARG_CONST;
+					dst->val = 0;
+				} else {
+					*dst = unknown;
+				}
+				break;
+			}
+			dst->val &= (s64)insn->imm;
+			break;
+		case BPF_AND | BPF_X:
+			if (ci_is_const(dst) && dst->val == 0)
+				break; /* 0 & x == 0 */
+			if (ci_is_const(src) && src->val == 0) {
+				dst->state = CONST_ARG_CONST;
+				dst->val = 0;
+				break;
+			}
+			if (!ci_is_const(dst) || !ci_is_const(src)) {
+				*dst = unknown;
+				break;
+			}
+			dst->val &= src->val;
+			break;
+		default:
+			*dst = unknown;
+			break;
+		}
+		/* 32-bit ALU truncates; non-const pointer states don't survive */
+		if (class == BPF_ALU) {
+			if (ci_is_const(dst))
+				dst->val = (u32)dst->val;
+			else if (!ci_is_unknown(dst))
+				*dst = unknown;
+		}
+		break;
+	case BPF_LD:
+		/* legacy LD_ABS/LD_IND clobbers caller-saved regs like a call */
+		if (mode == BPF_ABS || mode == BPF_IND)
+			goto process_call;
+		if (mode != BPF_IMM || BPF_SIZE(insn->code) != BPF_DW)
+			break;
+		/* ldimm64 variants, distinguished by src_reg pseudo value */
+		if (insn->src_reg == BPF_PSEUDO_FUNC) {
+			int subprog = bpf_find_subprog(env, idx + insn->imm + 1);
+
+			if (subprog >= 0) {
+				dst->state = CONST_ARG_SUBPROG;
+				dst->val = subprog;
+			} else {
+				*dst = unknown;
+			}
+		} else if (insn->src_reg == BPF_PSEUDO_MAP_VALUE ||
+			   insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) {
+			dst->state = CONST_ARG_MAP_VALUE;
+			dst->map_index = env->insn_aux_data[idx].map_index;
+			dst->val = env->insn_aux_data[idx].map_off;
+		} else if (insn->src_reg == BPF_PSEUDO_MAP_FD ||
+			   insn->src_reg == BPF_PSEUDO_MAP_IDX) {
+			dst->state = CONST_ARG_MAP_PTR;
+			dst->map_index = env->insn_aux_data[idx].map_index;
+		} else if (insn->src_reg == 0) {
+			/* plain ldimm64: 64-bit value split over two insns */
+			dst->state = CONST_ARG_CONST;
+			dst->val = (u64)(u32)insn->imm | ((u64)(u32)insns[idx + 1].imm << 32);
+		} else {
+			*dst = unknown;
+		}
+		break;
+	case BPF_LDX:
+		if (!ci_is_map_value(src)) {
+			*dst = unknown;
+			break;
+		}
+		struct bpf_map *map = env->used_maps[src->map_index];
+		int size = bpf_size_to_bytes(BPF_SIZE(insn->code));
+		bool is_ldsx = mode == BPF_MEMSX;
+		int off = src->val + insn->off;
+		u64 val = 0;
+
+		/* only in-bounds reads from frozen (read-only) maps fold */
+		if (!bpf_map_is_rdonly(map) || !map->ops->map_direct_value_addr ||
+		    map->map_type == BPF_MAP_TYPE_INSN_ARRAY ||
+		    off < 0 || off + size > map->value_size ||
+		    bpf_map_direct_read(map, off, size, &val, is_ldsx)) {
+			*dst = unknown;
+			break;
+		}
+		dst->state = CONST_ARG_CONST;
+		dst->val = val;
+		break;
+	case BPF_JMP:
+		if (opcode != BPF_CALL)
+			break;
+process_call:
+		/* calls clobber R0-R5 (return value + argument registers) */
+		for (r = BPF_REG_0; r <= BPF_REG_5; r++)
+			ci_out[r] = unknown;
+		break;
+	case BPF_STX:
+		if (mode != BPF_ATOMIC)
+			break;
+		/* atomics write back into registers depending on the op */
+		if (insn->imm == BPF_CMPXCHG)
+			ci_out[BPF_REG_0] = unknown;
+		else if (insn->imm == BPF_LOAD_ACQ)
+			*dst = unknown;
+		else if (insn->imm & BPF_FETCH)
+			*src = unknown;
+		break;
+	}
+}
+
+/*
+ * Join function: merge output state into a successor's input state.
+ * Lattice join: UNVISITED adopts the incoming fact verbatim; two
+ * disagreeing facts collapse to UNKNOWN. Returns true if the target
+ * state changed, i.e. another fixed-point iteration is needed.
+ */
+static bool const_reg_join(struct const_arg_info *ci_target,
+			   struct const_arg_info *ci_out)
+{
+	bool changed = false;
+	int r;
+
+	for (r = 0; r < MAX_BPF_REG; r++) {
+		struct const_arg_info *old = &ci_target[r];
+		struct const_arg_info *new = &ci_out[r];
+
+		if (ci_is_unvisited(old) && !ci_is_unvisited(new)) {
+			/* first fact to arrive at this edge: take it as-is */
+			ci_target[r] = *new;
+			changed = true;
+		} else if (!ci_is_unknown(old) && !ci_is_unvisited(old) &&
+			   (new->state != old->state || new->val != old->val ||
+			    new->map_index != old->map_index)) {
+			/* conflicting facts: degrade to UNKNOWN */
+			old->state = CONST_ARG_UNKNOWN;
+			changed = true;
+		}
+	}
+	return changed;
+}
+
+/*
+ * Forward dataflow driver: iterate the transfer/join functions in
+ * reverse postorder until a fixed point, then publish the 32-bit-safe
+ * results into env->insn_aux_data[].const_reg_{mask,map_mask,
+ * subprog_mask,vals}. Returns 0 on success, -ENOMEM on allocation
+ * failure.
+ */
+int bpf_compute_const_regs(struct bpf_verifier_env *env)
+{
+	struct const_arg_info unknown = { .state = CONST_ARG_UNKNOWN, .val = 0 };
+	struct bpf_insn_aux_data *insn_aux = env->insn_aux_data;
+	struct bpf_insn *insns = env->prog->insnsi;
+	int insn_cnt = env->prog->len;
+	struct const_arg_info (*ci_in)[MAX_BPF_REG];
+	struct const_arg_info ci_out[MAX_BPF_REG];
+	struct bpf_iarray *succ;
+	bool changed;
+	int i, r;
+
+	/* kvzalloc zeroes memory, so all entries start as CONST_ARG_UNVISITED (0) */
+	ci_in = kvzalloc_objs(*ci_in, insn_cnt, GFP_KERNEL_ACCOUNT);
+	if (!ci_in)
+		return -ENOMEM;
+
+	/* Subprogram entries (including main at subprog 0): all registers unknown */
+	for (i = 0; i < env->subprog_cnt; i++) {
+		int start = env->subprog_info[i].start;
+
+		for (r = 0; r < MAX_BPF_REG; r++)
+			ci_in[start][r] = unknown;
+	}
+
+redo:
+	changed = false;
+	/* reverse postorder minimizes the number of fixed-point iterations */
+	for (i = env->cfg.cur_postorder - 1; i >= 0; i--) {
+		int idx = env->cfg.insn_postorder[i];
+		struct bpf_insn *insn = &insns[idx];
+		struct const_arg_info *ci = ci_in[idx];
+
+		memcpy(ci_out, ci, sizeof(ci_out));
+
+		const_reg_xfer(env, ci_out, insn, insns, idx);
+
+		succ = bpf_insn_successors(env, idx);
+		for (int s = 0; s < succ->cnt; s++)
+			changed |= const_reg_join(ci_in[succ->items[s]], ci_out);
+	}
+	if (changed)
+		goto redo;
+
+	/* Save computed constants into insn_aux[] if they fit into 32-bit */
+	for (i = 0; i < insn_cnt; i++) {
+		u16 mask = 0, map_mask = 0, subprog_mask = 0;
+		struct bpf_insn_aux_data *aux = &insn_aux[i];
+		struct const_arg_info *ci = ci_in[i];
+
+		for (r = BPF_REG_0; r < ARRAY_SIZE(aux->const_reg_vals); r++) {
+			struct const_arg_info *c = &ci[r];
+
+			switch (c->state) {
+			case CONST_ARG_CONST: {
+				u64 val = c->val;
+
+				/* drop constants that don't fit in u32 */
+				if (val != (u32)val)
+					break;
+				mask |= BIT(r);
+				aux->const_reg_vals[r] = val;
+				break;
+			}
+			case CONST_ARG_MAP_PTR:
+				map_mask |= BIT(r);
+				aux->const_reg_vals[r] = c->map_index;
+				break;
+			case CONST_ARG_SUBPROG:
+				subprog_mask |= BIT(r);
+				aux->const_reg_vals[r] = c->val;
+				break;
+			default:
+				break;
+			}
+		}
+		aux->const_reg_mask = mask;
+		aux->const_reg_map_mask = map_mask;
+		aux->const_reg_subprog_mask = subprog_mask;
+	}
+
+	kvfree(ci_in);
+	return 0;
+}
+
+/*
+ * Evaluate a conditional jump whose both operands are known constants.
+ * Returns 1 if the branch is taken, 0 if not, -1 for an unrecognized
+ * jump opcode. Signed comparisons reinterpret the u64 operands as s64.
+ */
+static int eval_const_branch(u8 opcode, u64 dst_val, u64 src_val)
+{
+	switch (BPF_OP(opcode)) {
+	case BPF_JEQ: return dst_val == src_val;
+	case BPF_JNE: return dst_val != src_val;
+	case BPF_JGT: return dst_val > src_val;
+	case BPF_JGE: return dst_val >= src_val;
+	case BPF_JLT: return dst_val < src_val;
+	case BPF_JLE: return dst_val <= src_val;
+	case BPF_JSGT: return (s64)dst_val > (s64)src_val;
+	case BPF_JSGE: return (s64)dst_val >= (s64)src_val;
+	case BPF_JSLT: return (s64)dst_val < (s64)src_val;
+	case BPF_JSLE: return (s64)dst_val <= (s64)src_val;
+	case BPF_JSET: return (bool)(dst_val & src_val);
+	default: return -1;
+	}
+}
+
+/*
+ * Rewrite conditional branches with constant outcomes into unconditional
+ * jumps using register values resolved by bpf_compute_const_regs() pass.
+ * This eliminates dead edges from the CFG so that compute_live_registers()
+ * doesn't propagate liveness through dead code.
+ */
+int bpf_prune_dead_branches(struct bpf_verifier_env *env)
+{
+	struct bpf_insn_aux_data *insn_aux = env->insn_aux_data;
+	struct bpf_insn *insns = env->prog->insnsi;
+	int insn_cnt = env->prog->len;
+	bool changed = false;
+	int i;
+
+	for (i = 0; i < insn_cnt; i++) {
+		struct bpf_insn_aux_data *aux = &insn_aux[i];
+		struct bpf_insn *insn = &insns[i];
+		u8 class = BPF_CLASS(insn->code);
+		u64 dst_val, src_val;
+		int taken;
+
+		if (!bpf_insn_is_cond_jump(insn->code))
+			continue;
+		/* may_goto is resolved at runtime, never fold it */
+		if (bpf_is_may_goto_insn(insn))
+			continue;
+
+		/* both operands must be known constants at this insn */
+		if (!(aux->const_reg_mask & BIT(insn->dst_reg)))
+			continue;
+		dst_val = aux->const_reg_vals[insn->dst_reg];
+
+		if (BPF_SRC(insn->code) == BPF_K) {
+			src_val = insn->imm;
+		} else {
+			if (!(aux->const_reg_mask & BIT(insn->src_reg)))
+				continue;
+			src_val = aux->const_reg_vals[insn->src_reg];
+		}
+
+		if (class == BPF_JMP32) {
+			/*
+			 * The (s32) cast maps the 32-bit range into two u64 sub-ranges:
+			 *   [0x00000000, 0x7FFFFFFF] -> [0x0000000000000000, 0x000000007FFFFFFF]
+			 *   [0x80000000, 0xFFFFFFFF] -> [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF]
+			 * The ordering is preserved within each sub-range, and
+			 * the second sub-range is above the first as u64.
+			 */
+			dst_val = (s32)dst_val;
+			src_val = (s32)src_val;
+		}
+
+		taken = eval_const_branch(insn->code, dst_val, src_val);
+		if (taken < 0) {
+			bpf_log(&env->log, "Unknown conditional jump %x\n", insn->code);
+			return -EFAULT;
+		}
+		/* taken -> goto +off; not taken -> goto +0 (fall through) */
+		*insn = BPF_JMP_A(taken ? insn->off : 0);
+		changed = true;
+	}
+
+	if (!changed)
+		return 0;
+	/* recompute postorder, since CFG has changed */
+	kvfree(env->cfg.insn_postorder);
+	env->cfg.insn_postorder = NULL;
+	return bpf_compute_postorder(env);
+}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 3ece2da55625..8b018ff48875 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -18,7 +18,6 @@
*/
#include <uapi/linux/btf.h>
-#include <crypto/sha1.h>
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
@@ -1422,6 +1421,27 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
*to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off);
break;
+
+ case BPF_ST | BPF_PROBE_MEM32 | BPF_DW:
+ case BPF_ST | BPF_PROBE_MEM32 | BPF_W:
+ case BPF_ST | BPF_PROBE_MEM32 | BPF_H:
+ case BPF_ST | BPF_PROBE_MEM32 | BPF_B:
+ *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^
+ from->imm);
+ *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ /*
+ * Cannot use BPF_STX_MEM() macro here as it
+ * hardcodes BPF_MEM mode, losing PROBE_MEM32
+ * and breaking arena addressing in the JIT.
+ */
+ *to++ = (struct bpf_insn) {
+ .code = BPF_STX | BPF_PROBE_MEM32 |
+ BPF_SIZE(from->code),
+ .dst_reg = from->dst_reg,
+ .src_reg = BPF_REG_AX,
+ .off = from->off,
+ };
+ break;
}
out:
return to - to_buff;
@@ -1466,27 +1486,16 @@ void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other)
* know whether fp here is the clone or the original.
*/
fp->aux->prog = fp;
+ if (fp->aux->offload)
+ fp->aux->offload->prog = fp;
bpf_prog_clone_free(fp_other);
}
-static void adjust_insn_arrays(struct bpf_prog *prog, u32 off, u32 len)
-{
-#ifdef CONFIG_BPF_SYSCALL
- struct bpf_map *map;
- int i;
-
- if (len <= 1)
- return;
-
- for (i = 0; i < prog->aux->used_map_cnt; i++) {
- map = prog->aux->used_maps[i];
- if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY)
- bpf_insn_array_adjust(map, off, len);
- }
-#endif
-}
-
-struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
+/*
+ * Now this function is used only to blind the main prog and must be invoked only when
+ * bpf_prog_need_blind() returns true.
+ */
+struct bpf_prog *bpf_jit_blind_constants(struct bpf_verifier_env *env, struct bpf_prog *prog)
{
struct bpf_insn insn_buff[16], aux[2];
struct bpf_prog *clone, *tmp;
@@ -1494,13 +1503,17 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
struct bpf_insn *insn;
int i, rewritten;
- if (!prog->blinding_requested || prog->blinded)
- return prog;
+ if (WARN_ON_ONCE(env && env->prog != prog))
+ return ERR_PTR(-EINVAL);
clone = bpf_prog_clone_create(prog, GFP_USER);
if (!clone)
return ERR_PTR(-ENOMEM);
+ /* make sure bpf_patch_insn_data() patches the correct prog */
+ if (env)
+ env->prog = clone;
+
insn_cnt = clone->len;
insn = clone->insnsi;
@@ -1528,21 +1541,28 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
if (!rewritten)
continue;
- tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten);
- if (IS_ERR(tmp)) {
+ if (env)
+ tmp = bpf_patch_insn_data(env, i, insn_buff, rewritten);
+ else
+ tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten);
+
+ if (IS_ERR_OR_NULL(tmp)) {
+ if (env)
+ /* restore the original prog */
+ env->prog = prog;
/* Patching may have repointed aux->prog during
* realloc from the original one, so we need to
* fix it up here on error.
*/
bpf_jit_prog_release_other(prog, clone);
- return tmp;
+ return IS_ERR(tmp) ? tmp : ERR_PTR(-ENOMEM);
}
clone = tmp;
insn_delta = rewritten - 1;
- /* Instructions arrays must be updated using absolute xlated offsets */
- adjust_insn_arrays(clone, prog->aux->subprog_start + i, rewritten);
+ if (env)
+ env->prog = clone;
/* Walk new program and skip insns we just inserted. */
insn = clone->insnsi + i + insn_delta;
@@ -1553,6 +1573,15 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
clone->blinded = 1;
return clone;
}
+
+bool bpf_insn_is_indirect_target(const struct bpf_verifier_env *env, const struct bpf_prog *prog,
+ int insn_idx)
+{
+ if (!env)
+ return false;
+ insn_idx += prog->aux->subprog_start;
+ return env->insn_aux_data[insn_idx].indirect_target;
+}
#endif /* CONFIG_BPF_JIT */
/* Base function for offset calculation. Needs to go into .text section,
@@ -1736,6 +1765,12 @@ bool bpf_opcode_in_insntable(u8 code)
}
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+/* Absolute value of s32 without undefined behavior for S32_MIN */
+static u32 abs_s32(s32 x)
+{
+ return x >= 0 ? (u32)x : -(u32)x;
+}
+
/**
* ___bpf_prog_run - run eBPF program on a given context
* @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
@@ -1900,8 +1935,8 @@ select_insn:
DST = do_div(AX, (u32) SRC);
break;
case 1:
- AX = abs((s32)DST);
- AX = do_div(AX, abs((s32)SRC));
+ AX = abs_s32((s32)DST);
+ AX = do_div(AX, abs_s32((s32)SRC));
if ((s32)DST < 0)
DST = (u32)-AX;
else
@@ -1928,8 +1963,8 @@ select_insn:
DST = do_div(AX, (u32) IMM);
break;
case 1:
- AX = abs((s32)DST);
- AX = do_div(AX, abs((s32)IMM));
+ AX = abs_s32((s32)DST);
+ AX = do_div(AX, abs_s32((s32)IMM));
if ((s32)DST < 0)
DST = (u32)-AX;
else
@@ -1955,8 +1990,8 @@ select_insn:
DST = (u32) AX;
break;
case 1:
- AX = abs((s32)DST);
- do_div(AX, abs((s32)SRC));
+ AX = abs_s32((s32)DST);
+ do_div(AX, abs_s32((s32)SRC));
if (((s32)DST < 0) == ((s32)SRC < 0))
DST = (u32)AX;
else
@@ -1982,8 +2017,8 @@ select_insn:
DST = (u32) AX;
break;
case 1:
- AX = abs((s32)DST);
- do_div(AX, abs((s32)IMM));
+ AX = abs_s32((s32)DST);
+ do_div(AX, abs_s32((s32)IMM));
if (((s32)DST < 0) == ((s32)IMM < 0))
DST = (u32)AX;
else
@@ -2060,12 +2095,12 @@ select_insn:
if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT))
goto out;
- tail_call_cnt++;
-
prog = READ_ONCE(array->ptrs[index]);
if (!prog)
goto out;
+ tail_call_cnt++;
+
/* ARG1 at this point is guaranteed to point to CTX from
* the verifier side due to the fact that the tail call is
* handled like a helper, that is, bpf_tail_call_proto,
@@ -2505,18 +2540,55 @@ static bool bpf_prog_select_interpreter(struct bpf_prog *fp)
return select_interpreter;
}
-/**
- * bpf_prog_select_runtime - select exec runtime for BPF program
- * @fp: bpf_prog populated with BPF program
- * @err: pointer to error variable
- *
- * Try to JIT eBPF program, if JIT is not available, use interpreter.
- * The BPF program will be executed via bpf_prog_run() function.
- *
- * Return: the &fp argument along with &err set to 0 for success or
- * a negative errno code on failure
- */
-struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
+static struct bpf_prog *bpf_prog_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog)
+{
+#ifdef CONFIG_BPF_JIT
+ struct bpf_prog *orig_prog;
+ struct bpf_insn_aux_data *orig_insn_aux;
+
+ if (!bpf_prog_need_blind(prog))
+ return bpf_int_jit_compile(env, prog);
+
+ if (env) {
+ /*
+ * If env is not NULL, we are called from the end of bpf_check(). At this
+ * point only insn_aux_data is used after failure, so it must be restored
+ * on failure.
+ */
+ orig_insn_aux = bpf_dup_insn_aux_data(env);
+ if (!orig_insn_aux)
+ return prog;
+ }
+
+ orig_prog = prog;
+ prog = bpf_jit_blind_constants(env, prog);
+ /*
+ * If blinding was requested and we failed during blinding, we must fall
+ * back to the interpreter.
+ */
+ if (IS_ERR(prog))
+ goto out_restore;
+
+ prog = bpf_int_jit_compile(env, prog);
+ if (prog->jited) {
+ bpf_jit_prog_release_other(prog, orig_prog);
+ if (env)
+ vfree(orig_insn_aux);
+ return prog;
+ }
+
+ bpf_jit_prog_release_other(orig_prog, prog);
+
+out_restore:
+ prog = orig_prog;
+ if (env)
+ bpf_restore_insn_aux_data(env, orig_insn_aux);
+#endif
+ return prog;
+}
+
+struct bpf_prog *__bpf_prog_select_runtime(struct bpf_verifier_env *env, struct bpf_prog *fp,
+ int *err)
{
/* In case of BPF to BPF calls, verifier did all the prep
* work with regards to JITing, etc.
@@ -2544,7 +2616,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
if (*err)
return fp;
- fp = bpf_int_jit_compile(fp);
+ fp = bpf_prog_jit_compile(env, fp);
bpf_prog_jit_attempt_done(fp);
if (!fp->jited && jit_needed) {
*err = -ENOTSUPP;
@@ -2570,6 +2642,22 @@ finalize:
return fp;
}
+
+/**
+ * bpf_prog_select_runtime - select exec runtime for BPF program
+ * @fp: bpf_prog populated with BPF program
+ * @err: pointer to error variable
+ *
+ * Try to JIT eBPF program, if JIT is not available, use interpreter.
+ * The BPF program will be executed via bpf_prog_run() function.
+ *
+ * Return: the &fp argument along with &err set to 0 for success or
+ * a negative errno code on failure
+ */
+struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
+{
+ return __bpf_prog_select_runtime(NULL, fp, err);
+}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
static unsigned int __bpf_prog_ret1(const void *ctx,
@@ -2586,8 +2674,10 @@ static struct bpf_prog_dummy {
},
};
-struct bpf_empty_prog_array bpf_empty_prog_array = {
- .null_prog = NULL,
+struct bpf_prog_array bpf_empty_prog_array = {
+ .items = {
+ { .prog = NULL },
+ },
};
EXPORT_SYMBOL(bpf_empty_prog_array);
@@ -2598,14 +2688,14 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
if (prog_cnt)
p = kzalloc_flex(*p, items, prog_cnt + 1, flags);
else
- p = &bpf_empty_prog_array.hdr;
+ p = &bpf_empty_prog_array;
return p;
}
void bpf_prog_array_free(struct bpf_prog_array *progs)
{
- if (!progs || progs == &bpf_empty_prog_array.hdr)
+ if (!progs || progs == &bpf_empty_prog_array)
return;
kfree_rcu(progs, rcu);
}
@@ -2614,19 +2704,17 @@ static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
{
struct bpf_prog_array *progs;
- /* If RCU Tasks Trace grace period implies RCU grace period, there is
- * no need to call kfree_rcu(), just call kfree() directly.
+ /*
+ * RCU Tasks Trace grace period implies RCU grace period, so there is no
+ * need to call kfree_rcu(), just call kfree() directly.
*/
progs = container_of(rcu, struct bpf_prog_array, rcu);
- if (rcu_trace_implies_rcu_gp())
- kfree(progs);
- else
- kfree_rcu(progs, rcu);
+ kfree(progs);
}
void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
{
- if (!progs || progs == &bpf_empty_prog_array.hdr)
+ if (!progs || progs == &bpf_empty_prog_array)
return;
call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
}
@@ -3057,7 +3145,7 @@ const struct bpf_func_proto bpf_tail_call_proto = {
* It is encouraged to implement bpf_int_jit_compile() instead, so that
* eBPF and implicitly also cBPF can get JITed!
*/
-struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
+struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog)
{
return prog;
}
@@ -3287,6 +3375,63 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);
#ifdef CONFIG_BPF_SYSCALL
+void bpf_get_linfo_file_line(struct btf *btf, const struct bpf_line_info *linfo,
+ const char **filep, const char **linep, int *nump)
+{
+ /* Get base component of the file path. */
+ if (filep) {
+ *filep = btf_name_by_offset(btf, linfo->file_name_off);
+ *filep = kbasename(*filep);
+ }
+
+ /* Obtain the source line, and strip whitespace in prefix. */
+ if (linep) {
+ *linep = btf_name_by_offset(btf, linfo->line_off);
+ while (isspace(**linep))
+ *linep += 1;
+ }
+
+ if (nump)
+ *nump = BPF_LINE_INFO_LINE_NUM(linfo->line_col);
+}
+
+const struct bpf_line_info *bpf_find_linfo(const struct bpf_prog *prog, u32 insn_off)
+{
+ const struct bpf_line_info *linfo;
+ u32 nr_linfo;
+ int l, r, m;
+
+ nr_linfo = prog->aux->nr_linfo;
+ if (!nr_linfo || insn_off >= prog->len)
+ return NULL;
+
+ linfo = prog->aux->linfo;
+ /* Loop invariant: linfo[l].insn_off <= insn_off.
+ * linfo[0].insn_off == 0 which always satisfies above condition.
+ * Binary search is searching for rightmost linfo entry that satisfies
+ * the above invariant, giving us the desired record that covers given
+ * instruction offset.
+ */
+ l = 0;
+ r = nr_linfo - 1;
+ while (l < r) {
+ /* (r - l + 1) / 2 means we break a tie to the right, so if:
+ * l=1, r=2, linfo[l].insn_off <= insn_off, linfo[r].insn_off > insn_off,
+ * then m=2, we see that linfo[m].insn_off > insn_off, and so
+ * r becomes 1 and we exit the loop with correct l==1.
+ * If the tie was broken to the left, m=1 would end us up in
+ * an endless loop where l and m stay at 1 and r stays at 2.
+ */
+ m = l + (r - l + 1) / 2;
+ if (linfo[m].insn_off <= insn_off)
+ l = m;
+ else
+ r = m - 1;
+ }
+
+ return &linfo[l];
+}
+
int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep,
const char **linep, int *nump)
{
@@ -3321,14 +3466,7 @@ int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char *
if (idx == -1)
return -ENOENT;
- /* Get base component of the file path. */
- *filep = btf_name_by_offset(btf, linfo[idx].file_name_off);
- *filep = kbasename(*filep);
- /* Obtain the source line, and strip whitespace in prefix. */
- *linep = btf_name_by_offset(btf, linfo[idx].line_off);
- while (isspace(**linep))
- *linep += 1;
- *nump = BPF_LINE_INFO_LINE_NUM(linfo[idx].line_col);
+ bpf_get_linfo_file_line(btf, &linfo[idx], filep, linep, nump);
return 0;
}
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 04171fbc39cb..5e59ab896f05 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -29,6 +29,7 @@
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
+#include <linux/local_lock.h>
#include <linux/completion.h>
#include <trace/events/xdp.h>
#include <linux/btf_ids.h>
@@ -52,6 +53,7 @@ struct xdp_bulk_queue {
struct list_head flush_node;
struct bpf_cpu_map_entry *obj;
unsigned int count;
+ local_lock_t bq_lock;
};
/* Struct for every remote "destination" CPU in map */
@@ -221,7 +223,10 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
}
break;
default:
- bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
+ bpf_warn_invalid_xdp_action(xdpf->dev_rx, rcpu->prog, act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(xdpf->dev_rx, rcpu->prog, act);
fallthrough;
case XDP_DROP:
xdp_return_frame(xdpf);
@@ -451,6 +456,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
for_each_possible_cpu(i) {
bq = per_cpu_ptr(rcpu->bulkq, i);
bq->obj = rcpu;
+ local_lock_init(&bq->bq_lock);
}
/* Alloc queue */
@@ -722,6 +728,8 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
struct ptr_ring *q;
int i;
+ lockdep_assert_held(&bq->bq_lock);
+
if (unlikely(!bq->count))
return;
@@ -749,11 +757,15 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
}
/* Runs under RCU-read-side, plus in softirq under NAPI protection.
- * Thus, safe percpu variable access.
+ * Thus, safe percpu variable access. PREEMPT_RT relies on
+ * local_lock_nested_bh() to serialise access to the per-CPU bq.
*/
static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
- struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
+ struct xdp_bulk_queue *bq;
+
+ local_lock_nested_bh(&rcpu->bulkq->bq_lock);
+ bq = this_cpu_ptr(rcpu->bulkq);
if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
bq_flush_to_queue(bq);
@@ -774,6 +786,8 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
list_add(&bq->flush_node, flush_list);
}
+
+ local_unlock_nested_bh(&rcpu->bulkq->bq_lock);
}
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
@@ -810,7 +824,9 @@ void __cpu_map_flush(struct list_head *flush_list)
struct xdp_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
+ local_lock_nested_bh(&bq->obj->bulkq->bq_lock);
bq_flush_to_queue(bq);
+ local_unlock_nested_bh(&bq->obj->bulkq->bq_lock);
/* If already running, costs spin_lock_irqsave + smb_mb */
wake_up_process(bq->obj->kthread);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 2625601de76e..cc0a43ebab6b 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -45,6 +45,7 @@
* types of devmap; only the lookup and insertion is different.
*/
#include <linux/bpf.h>
+#include <linux/local_lock.h>
#include <net/xdp.h>
#include <linux/filter.h>
#include <trace/events/xdp.h>
@@ -60,6 +61,7 @@ struct xdp_dev_bulk_queue {
struct net_device *dev_rx;
struct bpf_prog *xdp_prog;
unsigned int count;
+ local_lock_t bq_lock;
};
struct bpf_dtab_netdev {
@@ -381,6 +383,8 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
int to_send = cnt;
int i;
+ lockdep_assert_held(&bq->bq_lock);
+
if (unlikely(!cnt))
return;
@@ -425,10 +429,12 @@ void __dev_flush(struct list_head *flush_list)
struct xdp_dev_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
+ local_lock_nested_bh(&bq->dev->xdp_bulkq->bq_lock);
bq_xmit_all(bq, XDP_XMIT_FLUSH);
bq->dev_rx = NULL;
bq->xdp_prog = NULL;
__list_del_clearprev(&bq->flush_node);
+ local_unlock_nested_bh(&bq->dev->xdp_bulkq->bq_lock);
}
}
@@ -451,12 +457,16 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
/* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu
* variable access, and map elements stick around. See comment above
- * xdp_do_flush() in filter.c.
+ * xdp_do_flush() in filter.c. PREEMPT_RT relies on local_lock_nested_bh()
+ * to serialise access to the per-CPU bq.
*/
static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{
- struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
+ struct xdp_dev_bulk_queue *bq;
+
+ local_lock_nested_bh(&dev->xdp_bulkq->bq_lock);
+ bq = this_cpu_ptr(dev->xdp_bulkq);
if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
bq_xmit_all(bq, 0);
@@ -477,6 +487,8 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
}
bq->q[bq->count++] = xdpf;
+
+ local_unlock_nested_bh(&dev->xdp_bulkq->bq_lock);
}
static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
@@ -588,18 +600,22 @@ static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifin
}
/* Get ifindex of each upper device. 'indexes' must be able to hold at
- * least MAX_NEST_DEV elements.
- * Returns the number of ifindexes added.
+ * least 'max' elements.
+ * Returns the number of ifindexes added, or -EOVERFLOW if there are too
+ * many upper devices.
*/
-static int get_upper_ifindexes(struct net_device *dev, int *indexes)
+static int get_upper_ifindexes(struct net_device *dev, int *indexes, int max)
{
struct net_device *upper;
struct list_head *iter;
int n = 0;
netdev_for_each_upper_dev_rcu(dev, upper, iter) {
+ if (n >= max)
+ return -EOVERFLOW;
indexes[n++] = upper->ifindex;
}
+
return n;
}
@@ -615,7 +631,11 @@ int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
int err;
if (exclude_ingress) {
- num_excluded = get_upper_ifindexes(dev_rx, excluded_devices);
+ num_excluded = get_upper_ifindexes(dev_rx, excluded_devices,
+ ARRAY_SIZE(excluded_devices) - 1);
+ if (num_excluded < 0)
+ return num_excluded;
+
excluded_devices[num_excluded++] = dev_rx->ifindex;
}
@@ -645,7 +665,7 @@ int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
for (i = 0; i < dtab->n_buckets; i++) {
head = dev_map_index_hash(dtab, i);
hlist_for_each_entry_rcu(dst, head, index_hlist,
- lockdep_is_held(&dtab->index_lock)) {
+ rcu_read_lock_bh_held()) {
if (!is_valid_dst(dst, xdpf))
continue;
@@ -727,13 +747,16 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
struct bpf_dtab_netdev *dst, *last_dst = NULL;
int excluded_devices[1+MAX_NEST_DEV];
struct hlist_head *head;
- struct hlist_node *next;
int num_excluded = 0;
unsigned int i;
int err;
if (exclude_ingress) {
- num_excluded = get_upper_ifindexes(dev, excluded_devices);
+ num_excluded = get_upper_ifindexes(dev, excluded_devices,
+ ARRAY_SIZE(excluded_devices) - 1);
+ if (num_excluded < 0)
+ return num_excluded;
+
excluded_devices[num_excluded++] = dev->ifindex;
}
@@ -763,7 +786,7 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
} else { /* BPF_MAP_TYPE_DEVMAP_HASH */
for (i = 0; i < dtab->n_buckets; i++) {
head = dev_map_index_hash(dtab, i);
- hlist_for_each_entry_safe(dst, next, head, index_hlist) {
+ hlist_for_each_entry_rcu(dst, head, index_hlist, rcu_read_lock_bh_held()) {
if (is_ifindex_excluded(excluded_devices, num_excluded,
dst->dev->ifindex))
continue;
@@ -1115,8 +1138,13 @@ static int dev_map_notification(struct notifier_block *notifier,
if (!netdev->xdp_bulkq)
return NOTIFY_BAD;
- for_each_possible_cpu(cpu)
- per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;
+ for_each_possible_cpu(cpu) {
+ struct xdp_dev_bulk_queue *bq;
+
+ bq = per_cpu_ptr(netdev->xdp_bulkq, cpu);
+ bq->dev = netdev;
+ local_lock_init(&bq->bq_lock);
+ }
break;
case NETDEV_UNREGISTER:
/* This rcu_read_lock/unlock pair is needed because
diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c
new file mode 100644
index 000000000000..fba9e8c00878
--- /dev/null
+++ b/kernel/bpf/fixups.c
@@ -0,0 +1,2570 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/filter.h>
+#include <linux/vmalloc.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
+#include <linux/perf_event.h>
+#include <net/xdp.h>
+#include "disasm.h"
+
+#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
+
+static bool is_cmpxchg_insn(const struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_CMPXCHG;
+}
+
+/* Return the regno defined by the insn, or -1. */
+static int insn_def_regno(const struct bpf_insn *insn)
+{
+ switch (BPF_CLASS(insn->code)) {
+ case BPF_JMP:
+ case BPF_JMP32:
+ case BPF_ST:
+ return -1;
+ case BPF_STX:
+ if (BPF_MODE(insn->code) == BPF_ATOMIC ||
+ BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) {
+ if (insn->imm == BPF_CMPXCHG)
+ return BPF_REG_0;
+ else if (insn->imm == BPF_LOAD_ACQ)
+ return insn->dst_reg;
+ else if (insn->imm & BPF_FETCH)
+ return insn->src_reg;
+ }
+ return -1;
+ default:
+ return insn->dst_reg;
+ }
+}
+
+/* Return TRUE if INSN has defined any 32-bit value explicitly. */
+static bool insn_has_def32(struct bpf_insn *insn)
+{
+ int dst_reg = insn_def_regno(insn);
+
+ if (dst_reg == -1)
+ return false;
+
+ return !bpf_is_reg64(insn, dst_reg, NULL, DST_OP);
+}
+
+static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b)
+{
+ const struct bpf_kfunc_desc *d0 = a;
+ const struct bpf_kfunc_desc *d1 = b;
+
+ if (d0->imm != d1->imm)
+ return d0->imm < d1->imm ? -1 : 1;
+ if (d0->offset != d1->offset)
+ return d0->offset < d1->offset ? -1 : 1;
+ return 0;
+}
+
+const struct btf_func_model *
+bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_kfunc_desc desc = {
+ .imm = insn->imm,
+ .offset = insn->off,
+ };
+ const struct bpf_kfunc_desc *res;
+ struct bpf_kfunc_desc_tab *tab;
+
+ tab = prog->aux->kfunc_tab;
+ res = bsearch(&desc, tab->descs, tab->nr_descs,
+ sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off);
+
+ return res ? &res->func_model : NULL;
+}
+
+static int set_kfunc_desc_imm(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc)
+{
+ unsigned long call_imm;
+
+ if (bpf_jit_supports_far_kfunc_call()) {
+ call_imm = desc->func_id;
+ } else {
+ call_imm = BPF_CALL_IMM(desc->addr);
+ /* Check whether the relative offset overflows desc->imm */
+ if ((unsigned long)(s32)call_imm != call_imm) {
+ verbose(env, "address of kernel func_id %u is out of range\n",
+ desc->func_id);
+ return -EINVAL;
+ }
+ }
+ desc->imm = call_imm;
+ return 0;
+}
+
+static int sort_kfunc_descs_by_imm_off(struct bpf_verifier_env *env)
+{
+ struct bpf_kfunc_desc_tab *tab;
+ int i, err;
+
+ tab = env->prog->aux->kfunc_tab;
+ if (!tab)
+ return 0;
+
+ for (i = 0; i < tab->nr_descs; i++) {
+ err = set_kfunc_desc_imm(env, &tab->descs[i]);
+ if (err)
+ return err;
+ }
+
+ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+ kfunc_desc_cmp_by_imm_off, NULL);
+ return 0;
+}
+
+static int add_kfunc_in_insns(struct bpf_verifier_env *env,
+ struct bpf_insn *insn, int cnt)
+{
+ int i, ret;
+
+ for (i = 0; i < cnt; i++, insn++) {
+ if (bpf_pseudo_kfunc_call(insn)) {
+ ret = bpf_add_kfunc_call(env, insn->imm, insn->off);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+static int get_callee_stack_depth(struct bpf_verifier_env *env,
+ const struct bpf_insn *insn, int idx)
+{
+ int start = idx + insn->imm + 1, subprog;
+
+ subprog = bpf_find_subprog(env, start);
+ if (verifier_bug_if(subprog < 0, env, "get stack depth: no program at insn %d", start))
+ return -EFAULT;
+ return env->subprog_info[subprog].stack_depth;
+}
+#endif
+
+/* single env->prog->insnsi[off] instruction was replaced with the range
+ * insnsi[off, off + cnt). Adjust corresponding insn_aux_data by copying
+ * [0, off) and [off, end) to new locations, so the patched range stays zero
+ */
+static void adjust_insn_aux_data(struct bpf_verifier_env *env,
+ struct bpf_prog *new_prog, u32 off, u32 cnt)
+{
+ struct bpf_insn_aux_data *data = env->insn_aux_data;
+ struct bpf_insn *insn = new_prog->insnsi;
+ u32 old_seen = data[off].seen;
+ u32 prog_len;
+ int i;
+
+ /* aux info at OFF always needs adjustment, no matter fast path
+ * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the
+ * original insn at old prog.
+ */
+ data[off].zext_dst = insn_has_def32(insn + off + cnt - 1);
+
+ if (cnt == 1)
+ return;
+ prog_len = new_prog->len;
+
+ memmove(data + off + cnt - 1, data + off,
+ sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+ memset(data + off, 0, sizeof(struct bpf_insn_aux_data) * (cnt - 1));
+ for (i = off; i < off + cnt - 1; i++) {
+ /* Expand insni[off]'s seen count to the patched range. */
+ data[i].seen = old_seen;
+ data[i].zext_dst = insn_has_def32(insn + i);
+ }
+
+ /*
+ * The indirect_target flag of the original instruction was moved to the last of the
+ * new instructions by the above memmove and memset, but the indirect jump target is
+ * actually the first instruction, so move it back. This also matches the behavior
+ * of bpf_insn_array_adjust(), which preserves xlated_off to point to the first new
+ * instruction.
+ */
+ if (data[off + cnt - 1].indirect_target) {
+ data[off].indirect_target = 1;
+ data[off + cnt - 1].indirect_target = 0;
+ }
+}
+
+static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
+{
+ int i;
+
+ if (len == 1)
+ return;
+ /* NOTE: fake 'exit' subprog should be updated as well. */
+ for (i = 0; i <= env->subprog_cnt; i++) {
+ if (env->subprog_info[i].start <= off)
+ continue;
+ env->subprog_info[i].start += len - 1;
+ }
+}
+
+static void adjust_insn_arrays(struct bpf_verifier_env *env, u32 off, u32 len)
+{
+ int i;
+
+ if (len == 1)
+ return;
+
+ for (i = 0; i < env->insn_array_map_cnt; i++)
+ bpf_insn_array_adjust(env->insn_array_maps[i], off, len);
+}
+
+static void adjust_insn_arrays_after_remove(struct bpf_verifier_env *env, u32 off, u32 len)
+{
+ int i;
+
+ for (i = 0; i < env->insn_array_map_cnt; i++)
+ bpf_insn_array_adjust_after_remove(env->insn_array_maps[i], off, len);
+}
+
+static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len)
+{
+ struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
+ int i, sz = prog->aux->size_poke_tab;
+ struct bpf_jit_poke_descriptor *desc;
+
+ for (i = 0; i < sz; i++) {
+ desc = &tab[i];
+ if (desc->insn_idx <= off)
+ continue;
+ desc->insn_idx += len - 1;
+ }
+}
+
+struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off,
+ const struct bpf_insn *patch, u32 len)
+{
+ struct bpf_prog *new_prog;
+ struct bpf_insn_aux_data *new_data = NULL;
+
+ if (len > 1) {
+ new_data = vrealloc(env->insn_aux_data,
+ array_size(env->prog->len + len - 1,
+ sizeof(struct bpf_insn_aux_data)),
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!new_data)
+ return NULL;
+
+ env->insn_aux_data = new_data;
+ }
+
+ new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
+ if (IS_ERR(new_prog)) {
+ if (PTR_ERR(new_prog) == -ERANGE)
+ verbose(env,
+ "insn %d cannot be patched due to 16-bit range\n",
+ env->insn_aux_data[off].orig_idx);
+ return NULL;
+ }
+ adjust_insn_aux_data(env, new_prog, off, len);
+ adjust_subprog_starts(env, off, len);
+ adjust_insn_arrays(env, off, len);
+ adjust_poke_descs(new_prog, off, len);
+ return new_prog;
+}
+
+/*
+ * For all jmp insns in a given 'prog' that point to 'tgt_idx' insn adjust the
+ * jump offset by 'delta'.
+ */
+static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta)
+{
+ struct bpf_insn *insn = prog->insnsi;
+ u32 insn_cnt = prog->len, i;
+ s32 imm;
+ s16 off;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ u8 code = insn->code;
+
+ if (tgt_idx <= i && i < tgt_idx + delta)
+ continue;
+
+ if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) ||
+ BPF_OP(code) == BPF_CALL || BPF_OP(code) == BPF_EXIT)
+ continue;
+
+ if (insn->code == (BPF_JMP32 | BPF_JA)) {
+ if (i + 1 + insn->imm != tgt_idx)
+ continue;
+ if (check_add_overflow(insn->imm, delta, &imm))
+ return -ERANGE;
+ insn->imm = imm;
+ } else {
+ if (i + 1 + insn->off != tgt_idx)
+ continue;
+ if (check_add_overflow(insn->off, delta, &off))
+ return -ERANGE;
+ insn->off = off;
+ }
+ }
+ return 0;
+}
+
+static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
+ u32 off, u32 cnt)
+{
+ int i, j;
+
+ /* find first prog starting at or after off (first to remove) */
+ for (i = 0; i < env->subprog_cnt; i++)
+ if (env->subprog_info[i].start >= off)
+ break;
+ /* find first prog starting at or after off + cnt (first to stay) */
+ for (j = i; j < env->subprog_cnt; j++)
+ if (env->subprog_info[j].start >= off + cnt)
+ break;
+ /* if j doesn't start exactly at off + cnt, we are just removing
+ * the front of previous prog
+ */
+ if (env->subprog_info[j].start != off + cnt)
+ j--;
+
+ if (j > i) {
+ struct bpf_prog_aux *aux = env->prog->aux;
+ int move;
+
+ /* move fake 'exit' subprog as well */
+ move = env->subprog_cnt + 1 - j;
+
+ memmove(env->subprog_info + i,
+ env->subprog_info + j,
+ sizeof(*env->subprog_info) * move);
+ env->subprog_cnt -= j - i;
+
+ /* remove func_info */
+ if (aux->func_info) {
+ move = aux->func_info_cnt - j;
+
+ memmove(aux->func_info + i,
+ aux->func_info + j,
+ sizeof(*aux->func_info) * move);
+ aux->func_info_cnt -= j - i;
+ /* func_info->insn_off is set after all code rewrites,
+ * in adjust_btf_func() - no need to adjust
+ */
+ }
+ } else {
+ /* convert i from "first prog to remove" to "first to adjust" */
+ if (env->subprog_info[i].start == off)
+ i++;
+ }
+
+ /* update fake 'exit' subprog as well */
+ for (; i <= env->subprog_cnt; i++)
+ env->subprog_info[i].start -= cnt;
+
+ return 0;
+}
+
+static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off,
+ u32 cnt)
+{
+ struct bpf_prog *prog = env->prog;
+ u32 i, l_off, l_cnt, nr_linfo;
+ struct bpf_line_info *linfo;
+
+ nr_linfo = prog->aux->nr_linfo;
+ if (!nr_linfo)
+ return 0;
+
+ linfo = prog->aux->linfo;
+
+ /* find first line info to remove, count lines to be removed */
+ for (i = 0; i < nr_linfo; i++)
+ if (linfo[i].insn_off >= off)
+ break;
+
+ l_off = i;
+ l_cnt = 0;
+ for (; i < nr_linfo; i++)
+ if (linfo[i].insn_off < off + cnt)
+ l_cnt++;
+ else
+ break;
+
+ /* First live insn doesn't match first live linfo, it needs to "inherit"
+ * last removed linfo. prog is already modified, so prog->len == off
+ * means no live instructions after (tail of the program was removed).
+ */
+ if (prog->len != off && l_cnt &&
+ (i == nr_linfo || linfo[i].insn_off != off + cnt)) {
+ l_cnt--;
+ linfo[--i].insn_off = off + cnt;
+ }
+
+ /* remove the line info which refer to the removed instructions */
+ if (l_cnt) {
+ memmove(linfo + l_off, linfo + i,
+ sizeof(*linfo) * (nr_linfo - i));
+
+ prog->aux->nr_linfo -= l_cnt;
+ nr_linfo = prog->aux->nr_linfo;
+ }
+
+ /* pull all linfo[i].insn_off >= off + cnt in by cnt */
+ for (i = l_off; i < nr_linfo; i++)
+ linfo[i].insn_off -= cnt;
+
+ /* fix up all subprogs (incl. 'exit') which start >= off */
+ for (i = 0; i <= env->subprog_cnt; i++)
+ if (env->subprog_info[i].linfo_idx > l_off) {
+ /* program may have started in the removed region but
+ * may not be fully removed
+ */
+ if (env->subprog_info[i].linfo_idx >= l_off + l_cnt)
+ env->subprog_info[i].linfo_idx -= l_cnt;
+ else
+ env->subprog_info[i].linfo_idx = l_off;
+ }
+
+ return 0;
+}
+
+/*
+ * Clean up dynamically allocated fields of aux data for instructions [start, start + len)
+ */
+void bpf_clear_insn_aux_data(struct bpf_verifier_env *env, int start, int len)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn *insns = env->prog->insnsi;
+ int end = start + len;
+ int i;
+
+ for (i = start; i < end; i++) {
+ if (aux_data[i].jt) {
+ kvfree(aux_data[i].jt);
+ aux_data[i].jt = NULL;
+ }
+
+ if (bpf_is_ldimm64(&insns[i]))
+ i++;
+ }
+}
+
+static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ unsigned int orig_prog_len = env->prog->len;
+ int err;
+
+ if (bpf_prog_is_offloaded(env->prog->aux))
+ bpf_prog_offload_remove_insns(env, off, cnt);
+
+ /* Should be called before bpf_remove_insns, as it uses prog->insnsi */
+ bpf_clear_insn_aux_data(env, off, cnt);
+
+ err = bpf_remove_insns(env->prog, off, cnt);
+ if (err)
+ return err;
+
+ err = adjust_subprog_starts_after_remove(env, off, cnt);
+ if (err)
+ return err;
+
+ err = bpf_adj_linfo_after_remove(env, off, cnt);
+ if (err)
+ return err;
+
+ adjust_insn_arrays_after_remove(env, off, cnt);
+
+ memmove(aux_data + off, aux_data + off + cnt,
+ sizeof(*aux_data) * (orig_prog_len - off - cnt));
+
+ return 0;
+}
+
+static const struct bpf_insn NOP = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+static const struct bpf_insn MAY_GOTO_0 = BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 0, 0);
+
+bool bpf_insn_is_cond_jump(u8 code)
+{
+ u8 op;
+
+ op = BPF_OP(code);
+ if (BPF_CLASS(code) == BPF_JMP32)
+ return op != BPF_JA;
+
+ if (BPF_CLASS(code) != BPF_JMP)
+ return false;
+
+ return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL;
+}
+
+/*
+ * Turn conditional jumps with one dead side into unconditional ones,
+ * based on the 'seen' marks verification left in insn aux data:
+ * - fall-through insn never seen -> always take the branch
+ * - branch target never seen     -> never take it ("goto +0" nop)
+ * The dead side is then reclaimable by bpf_opt_remove_dead_code().
+ */
+void bpf_opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
+{
+	struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+	struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+	struct bpf_insn *insn = env->prog->insnsi;
+	const int insn_cnt = env->prog->len;
+	int i;
+
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		if (!bpf_insn_is_cond_jump(insn->code))
+			continue;
+
+		if (!aux_data[i + 1].seen)
+			ja.off = insn->off;
+		else if (!aux_data[i + 1 + insn->off].seen)
+			ja.off = 0;
+		else
+			continue;
+
+		if (bpf_prog_is_offloaded(env->prog->aux))
+			bpf_prog_offload_replace_insn(env, i, &ja);
+
+		memcpy(insn, &ja, sizeof(ja));
+	}
+}
+
+/*
+ * Delete maximal runs of instructions the verifier never visited
+ * (aux_data[].seen == 0).  prog->len shrinks on each removal, so the
+ * bound is refreshed after every verifier_remove_insns() call.
+ */
+int bpf_opt_remove_dead_code(struct bpf_verifier_env *env)
+{
+	struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+	int insn_cnt = env->prog->len;
+	int i, err;
+
+	for (i = 0; i < insn_cnt; i++) {
+		int j;
+
+		/* extend the run of consecutive unseen insns starting at i */
+		j = 0;
+		while (i + j < insn_cnt && !aux_data[i + j].seen)
+			j++;
+		if (!j)
+			continue;
+
+		err = verifier_remove_insns(env, i, j);
+		if (err)
+			return err;
+		insn_cnt = env->prog->len;
+	}
+
+	return 0;
+}
+
+/*
+ * Strip no-op instructions: "goto +0" and "may_goto +0".  Removing a
+ * may_goto +0 can turn a preceding "may_goto +1" into "may_goto +0",
+ * hence the extra step back to re-examine it.
+ */
+int bpf_opt_remove_nops(struct bpf_verifier_env *env)
+{
+	struct bpf_insn *insn = env->prog->insnsi;
+	int insn_cnt = env->prog->len;
+	bool is_may_goto_0, is_ja;
+	int i, err;
+
+	for (i = 0; i < insn_cnt; i++) {
+		is_may_goto_0 = !memcmp(&insn[i], &MAY_GOTO_0, sizeof(MAY_GOTO_0));
+		is_ja = !memcmp(&insn[i], &NOP, sizeof(NOP));
+
+		if (!is_may_goto_0 && !is_ja)
+			continue;
+
+		err = verifier_remove_insns(env, i, 1);
+		if (err)
+			return err;
+		insn_cnt--;
+		/* Go back one insn to catch may_goto +1; may_goto +0 sequence */
+		i -= (is_may_goto_0 && i > 0) ? 2 : 1;
+	}
+
+	return 0;
+}
+
+/*
+ * For each insn the verifier marked zext_dst (32-bit subregister def
+ * needing zero-extension), append an explicit BPF_ZEXT_REG when the
+ * JIT requests it (or for CMPXCHG).  With BPF_F_TEST_RND_HI32, insns
+ * NOT needing zext instead get their high 32 bits poisoned with a
+ * random value to expose programs that wrongly rely on them.
+ */
+int bpf_opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
+				      const union bpf_attr *attr)
+{
+	struct bpf_insn *patch;
+	/* use env->insn_buf as two independent buffers */
+	struct bpf_insn *zext_patch = env->insn_buf;
+	struct bpf_insn *rnd_hi32_patch = &env->insn_buf[2];
+	struct bpf_insn_aux_data *aux = env->insn_aux_data;
+	int i, patch_len, delta = 0, len = env->prog->len;
+	struct bpf_insn *insns = env->prog->insnsi;
+	struct bpf_prog *new_prog;
+	bool rnd_hi32;
+
+	rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32;
+	/* pre-build the constant parts of both patch templates; slot [0]
+	 * and the per-insn registers are filled in per iteration
+	 */
+	zext_patch[1] = BPF_ZEXT_REG(0);
+	rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0);
+	rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
+	rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX);
+	for (i = 0; i < len; i++) {
+		int adj_idx = i + delta;
+		struct bpf_insn insn;
+		int load_reg;
+
+		insn = insns[adj_idx];
+		load_reg = insn_def_regno(&insn);
+		if (!aux[adj_idx].zext_dst) {
+			u8 code, class;
+			u32 imm_rnd;
+
+			if (!rnd_hi32)
+				continue;
+
+			code = insn.code;
+			class = BPF_CLASS(code);
+			if (load_reg == -1)
+				continue;
+
+			/* NOTE: arg "reg" (the fourth one) is only used for
+			 * BPF_STX + SRC_OP, so it is safe to pass NULL
+			 * here.
+			 */
+			if (bpf_is_reg64(&insn, load_reg, NULL, DST_OP)) {
+				/* ld_imm64 occupies two insn slots */
+				if (class == BPF_LD &&
+				    BPF_MODE(code) == BPF_IMM)
+					i++;
+				continue;
+			}
+
+			/* ctx load could be transformed into wider load. */
+			if (class == BPF_LDX &&
+			    aux[adj_idx].ptr_type == PTR_TO_CTX)
+				continue;
+
+			imm_rnd = get_random_u32();
+			rnd_hi32_patch[0] = insn;
+			rnd_hi32_patch[1].imm = imm_rnd;
+			rnd_hi32_patch[3].dst_reg = load_reg;
+			patch = rnd_hi32_patch;
+			patch_len = 4;
+			goto apply_patch_buffer;
+		}
+
+		/* Add in an zero-extend instruction if a) the JIT has requested
+		 * it or b) it's a CMPXCHG.
+		 *
+		 * The latter is because: BPF_CMPXCHG always loads a value into
+		 * R0, therefore always zero-extends. However some archs'
+		 * equivalent instruction only does this load when the
+		 * comparison is successful. This detail of CMPXCHG is
+		 * orthogonal to the general zero-extension behaviour of the
+		 * CPU, so it's treated independently of bpf_jit_needs_zext.
+		 */
+		if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn))
+			continue;
+
+		/* Zero-extension is done by the caller. */
+		if (bpf_pseudo_kfunc_call(&insn))
+			continue;
+
+		if (verifier_bug_if(load_reg == -1, env,
+				    "zext_dst is set, but no reg is defined"))
+			return -EFAULT;
+
+		zext_patch[0] = insn;
+		zext_patch[1].dst_reg = load_reg;
+		zext_patch[1].src_reg = load_reg;
+		patch = zext_patch;
+		patch_len = 2;
+apply_patch_buffer:
+		new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len);
+		if (!new_prog)
+			return -ENOMEM;
+		env->prog = new_prog;
+		/* patching reallocates both arrays; re-load the pointers */
+		insns = new_prog->insnsi;
+		aux = env->insn_aux_data;
+		delta += patch_len - 1;
+	}
+
+	return 0;
+}
+
+/* convert load instructions that access fields of a context type into a
+ * sequence of instructions that access fields of the underlying structure:
+ *     struct __sk_buff    -> struct sk_buff
+ *     struct bpf_sock_ops -> struct sock
+ * Also inserts prologue/epilogue insns, Spectre nospec barriers, arena
+ * and BTF-ID probe-mode conversions, and masks narrow ctx loads.
+ */
+int bpf_convert_ctx_accesses(struct bpf_verifier_env *env)
+{
+	struct bpf_subprog_info *subprogs = env->subprog_info;
+	const struct bpf_verifier_ops *ops = env->ops;
+	int i, cnt, size, ctx_field_size, ret, delta = 0, epilogue_cnt = 0;
+	const int insn_cnt = env->prog->len;
+	struct bpf_insn *epilogue_buf = env->epilogue_buf;
+	struct bpf_insn *insn_buf = env->insn_buf;
+	struct bpf_insn *insn;
+	u32 target_size, size_default, off;
+	struct bpf_prog *new_prog;
+	enum bpf_access_type type;
+	bool is_narrower_load;
+	int epilogue_idx = 0;
+
+	/* Phase 1: let the prog-type ops generate an epilogue; the ctx ptr
+	 * is spilled to the stack at prog entry so the epilogue can use it.
+	 */
+	if (ops->gen_epilogue) {
+		epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog,
+						 -(subprogs[0].stack_depth + 8));
+		if (epilogue_cnt >= INSN_BUF_SIZE) {
+			verifier_bug(env, "epilogue is too long");
+			return -EFAULT;
+		} else if (epilogue_cnt) {
+			/* Save the ARG_PTR_TO_CTX for the epilogue to use */
+			cnt = 0;
+			subprogs[0].stack_depth += 8;
+			insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1,
+						      -subprogs[0].stack_depth);
+			insn_buf[cnt++] = env->prog->insnsi[0];
+			new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+			env->prog = new_prog;
+			delta += cnt - 1;
+
+			ret = add_kfunc_in_insns(env, epilogue_buf, epilogue_cnt - 1);
+			if (ret < 0)
+				return ret;
+		}
+	}
+
+	/* Phase 2: prepend a generated prologue when needed */
+	if (ops->gen_prologue || env->seen_direct_write) {
+		if (!ops->gen_prologue) {
+			verifier_bug(env, "gen_prologue is null");
+			return -EFAULT;
+		}
+		cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
+					env->prog);
+		if (cnt >= INSN_BUF_SIZE) {
+			verifier_bug(env, "prologue is too long");
+			return -EFAULT;
+		} else if (cnt) {
+			new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			env->prog = new_prog;
+			delta += cnt - 1;
+
+			ret = add_kfunc_in_insns(env, insn_buf, cnt - 1);
+			if (ret < 0)
+				return ret;
+		}
+	}
+
+	if (delta)
+		WARN_ON(adjust_jmp_off(env->prog, 0, delta));
+
+	/* offloaded progs keep their original ctx accesses */
+	if (bpf_prog_is_offloaded(env->prog->aux))
+		return 0;
+
+	insn = env->prog->insnsi + delta;
+
+	/* Phase 3: walk every original insn and rewrite as needed */
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		bpf_convert_ctx_access_t convert_ctx_access;
+		u8 mode;
+
+		if (env->insn_aux_data[i + delta].nospec) {
+			WARN_ON_ONCE(env->insn_aux_data[i + delta].alu_state);
+			struct bpf_insn *patch = insn_buf;
+
+			*patch++ = BPF_ST_NOSPEC();
+			*patch++ = *insn;
+			cnt = patch - insn_buf;
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta += cnt - 1;
+			env->prog = new_prog;
+			insn = new_prog->insnsi + i + delta;
+			/* This can not be easily merged with the
+			 * nospec_result-case, because an insn may require a
+			 * nospec before and after itself. Therefore also do not
+			 * 'continue' here but potentially apply further
+			 * patching to insn. *insn should equal patch[1] now.
+			 */
+		}
+
+		/* classify the insn as a read, a write, an arena atomic,
+		 * or an exit that needs the generated epilogue
+		 */
+		if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
+		    insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
+		    insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
+		    insn->code == (BPF_LDX | BPF_MEM | BPF_DW) ||
+		    insn->code == (BPF_LDX | BPF_MEMSX | BPF_B) ||
+		    insn->code == (BPF_LDX | BPF_MEMSX | BPF_H) ||
+		    insn->code == (BPF_LDX | BPF_MEMSX | BPF_W)) {
+			type = BPF_READ;
+		} else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
+			   insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
+			   insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
+			   insn->code == (BPF_STX | BPF_MEM | BPF_DW) ||
+			   insn->code == (BPF_ST | BPF_MEM | BPF_B) ||
+			   insn->code == (BPF_ST | BPF_MEM | BPF_H) ||
+			   insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
+			   insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
+			type = BPF_WRITE;
+		} else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_B) ||
+			    insn->code == (BPF_STX | BPF_ATOMIC | BPF_H) ||
+			    insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) ||
+			    insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) &&
+			   env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) {
+			insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code);
+			env->prog->aux->num_exentries++;
+			continue;
+		} else if (insn->code == (BPF_JMP | BPF_EXIT) &&
+			   epilogue_cnt &&
+			   i + delta < subprogs[1].start) {
+			/* Generate epilogue for the main prog */
+			if (epilogue_idx) {
+				/* jump back to the earlier generated epilogue */
+				insn_buf[0] = BPF_JMP32_A(epilogue_idx - i - delta - 1);
+				cnt = 1;
+			} else {
+				memcpy(insn_buf, epilogue_buf,
+				       epilogue_cnt * sizeof(*epilogue_buf));
+				cnt = epilogue_cnt;
+				/* epilogue_idx cannot be 0. It must have at
+				 * least one ctx ptr saving insn before the
+				 * epilogue.
+				 */
+				epilogue_idx = i + delta;
+			}
+			goto patch_insn_buf;
+		} else {
+			continue;
+		}
+
+		if (type == BPF_WRITE &&
+		    env->insn_aux_data[i + delta].nospec_result) {
+			/* nospec_result is only used to mitigate Spectre v4 and
+			 * to limit verification-time for Spectre v1.
+			 */
+			struct bpf_insn *patch = insn_buf;
+
+			*patch++ = *insn;
+			*patch++ = BPF_ST_NOSPEC();
+			cnt = patch - insn_buf;
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta += cnt - 1;
+			env->prog = new_prog;
+			insn = new_prog->insnsi + i + delta;
+			continue;
+		}
+
+		/* pick the conversion callback based on the pointer type the
+		 * verifier recorded for this access
+		 */
+		switch ((int)env->insn_aux_data[i + delta].ptr_type) {
+		case PTR_TO_CTX:
+			if (!ops->convert_ctx_access)
+				continue;
+			convert_ctx_access = ops->convert_ctx_access;
+			break;
+		case PTR_TO_SOCKET:
+		case PTR_TO_SOCK_COMMON:
+			convert_ctx_access = bpf_sock_convert_ctx_access;
+			break;
+		case PTR_TO_TCP_SOCK:
+			convert_ctx_access = bpf_tcp_sock_convert_ctx_access;
+			break;
+		case PTR_TO_XDP_SOCK:
+			convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
+			break;
+		case PTR_TO_BTF_ID:
+		case PTR_TO_BTF_ID | PTR_UNTRUSTED:
+		/* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike
+		 * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot
+		 * be said once it is marked PTR_UNTRUSTED, hence we must handle
+		 * any faults for loads into such types. BPF_WRITE is disallowed
+		 * for this case.
+		 */
+		case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED:
+		case PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED:
+			if (type == BPF_READ) {
+				if (BPF_MODE(insn->code) == BPF_MEM)
+					insn->code = BPF_LDX | BPF_PROBE_MEM |
+						     BPF_SIZE((insn)->code);
+				else
+					insn->code = BPF_LDX | BPF_PROBE_MEMSX |
+						     BPF_SIZE((insn)->code);
+				env->prog->aux->num_exentries++;
+			}
+			continue;
+		case PTR_TO_ARENA:
+			if (BPF_MODE(insn->code) == BPF_MEMSX) {
+				if (!bpf_jit_supports_insn(insn, true)) {
+					verbose(env, "sign extending loads from arena are not supported yet\n");
+					return -EOPNOTSUPP;
+				}
+				insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32SX | BPF_SIZE(insn->code);
+			} else {
+				insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code);
+			}
+			env->prog->aux->num_exentries++;
+			continue;
+		default:
+			continue;
+		}
+
+		ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
+		size = BPF_LDST_BYTES(insn);
+		mode = BPF_MODE(insn->code);
+
+		/* If the read access is a narrower load of the field,
+		 * convert to a 4/8-byte load, to minimum program type specific
+		 * convert_ctx_access changes. If conversion is successful,
+		 * we will apply proper mask to the result.
+		 */
+		is_narrower_load = size < ctx_field_size;
+		size_default = bpf_ctx_off_adjust_machine(ctx_field_size);
+		off = insn->off;
+		if (is_narrower_load) {
+			u8 size_code;
+
+			if (type == BPF_WRITE) {
+				verifier_bug(env, "narrow ctx access misconfigured");
+				return -EFAULT;
+			}
+
+			size_code = BPF_H;
+			if (ctx_field_size == 4)
+				size_code = BPF_W;
+			else if (ctx_field_size == 8)
+				size_code = BPF_DW;
+
+			insn->off = off & ~(size_default - 1);
+			insn->code = BPF_LDX | BPF_MEM | size_code;
+		}
+
+		target_size = 0;
+		cnt = convert_ctx_access(type, insn, insn_buf, env->prog,
+					 &target_size);
+		if (cnt == 0 || cnt >= INSN_BUF_SIZE ||
+		    (ctx_field_size && !target_size)) {
+			verifier_bug(env, "error during ctx access conversion (%d)", cnt);
+			return -EFAULT;
+		}
+
+		/* shift + mask the widened load back down to the bytes the
+		 * original narrow load asked for
+		 */
+		if (is_narrower_load && size < target_size) {
+			u8 shift = bpf_ctx_narrow_access_offset(
+				off, size, size_default) * 8;
+			if (shift && cnt + 1 >= INSN_BUF_SIZE) {
+				verifier_bug(env, "narrow ctx load misconfigured");
+				return -EFAULT;
+			}
+			if (ctx_field_size <= 4) {
+				if (shift)
+					insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
+									insn->dst_reg,
+									shift);
+				insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
+								(1 << size * 8) - 1);
+			} else {
+				if (shift)
+					insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
+									insn->dst_reg,
+									shift);
+				insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
+								(1ULL << size * 8) - 1);
+			}
+		}
+		/* original was a sign-extending load: re-apply the sign extension */
+		if (mode == BPF_MEMSX)
+			insn_buf[cnt++] = BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_X,
+						       insn->dst_reg, insn->dst_reg,
+						       size * 8, 0);
+
+patch_insn_buf:
+		new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+		if (!new_prog)
+			return -ENOMEM;
+
+		delta += cnt - 1;
+
+		/* keep walking new program and skip insns we just inserted */
+		env->prog = new_prog;
+		insn = new_prog->insnsi + i + delta;
+	}
+
+	return 0;
+}
+
+/*
+ * Snapshot the current subprog start offsets into a freshly allocated
+ * array, or return NULL on allocation failure.  Caller owns the copy
+ * and frees it with kvfree().
+ */
+static u32 *bpf_dup_subprog_starts(struct bpf_verifier_env *env)
+{
+	u32 *copy;
+	int i;
+
+	copy = kvmalloc_objs(u32, env->subprog_cnt, GFP_KERNEL_ACCOUNT);
+	if (!copy)
+		return NULL;
+
+	for (i = 0; i < env->subprog_cnt; i++)
+		copy[i] = env->subprog_info[i].start;
+
+	return copy;
+}
+
+/* Undo subprog start changes using a snapshot from bpf_dup_subprog_starts() */
+static void bpf_restore_subprog_starts(struct bpf_verifier_env *env, u32 *orig_starts)
+{
+	for (int i = 0; i < env->subprog_cnt; i++)
+		env->subprog_info[i].start = orig_starts[i];
+	/* restore the start of fake 'exit' subprog as well */
+	env->subprog_info[env->subprog_cnt].start = env->prog->len;
+}
+
+/*
+ * Snapshot env->insn_aux_data (one entry per insn) so a later
+ * transformation can be rolled back via bpf_restore_insn_aux_data().
+ * Returns NULL on allocation failure; caller takes ownership.
+ */
+struct bpf_insn_aux_data *bpf_dup_insn_aux_data(struct bpf_verifier_env *env)
+{
+	size_t size;
+	void *new_aux;
+
+	size = array_size(sizeof(struct bpf_insn_aux_data), env->prog->len);
+	new_aux = __vmalloc(size, GFP_KERNEL_ACCOUNT);
+	if (new_aux)
+		memcpy(new_aux, env->insn_aux_data, size);
+	return new_aux;
+}
+
+/* Swap back a snapshot made by bpf_dup_insn_aux_data(), freeing the
+ * current (possibly grown) array.
+ */
+void bpf_restore_insn_aux_data(struct bpf_verifier_env *env,
+			       struct bpf_insn_aux_data *orig_insn_aux)
+{
+	/* the expanded elements are zero-filled, so no special handling is required */
+	vfree(env->insn_aux_data);
+	env->insn_aux_data = orig_insn_aux;
+}
+
+/*
+ * Split the verified program at subprog boundaries, JIT each piece,
+ * then resolve bpf-to-bpf call/func addresses and run a final JIT
+ * pass.  On success the main prog is switched over to the JITed image;
+ * on failure all per-subprog progs are freed and insns restored so the
+ * caller may fall back to the interpreter.
+ */
+static int jit_subprogs(struct bpf_verifier_env *env)
+{
+	struct bpf_prog *prog = env->prog, **func, *tmp;
+	int i, j, subprog_start, subprog_end = 0, len, subprog;
+	struct bpf_map *map_ptr;
+	struct bpf_insn *insn;
+	void *old_bpf_func;
+	int err, num_exentries;
+
+	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+		if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn))
+			continue;
+
+		/* Upon error here we cannot fall back to interpreter but
+		 * need a hard reject of the program. Thus -EFAULT is
+		 * propagated in any case.
+		 */
+		subprog = bpf_find_subprog(env, i + insn->imm + 1);
+		if (verifier_bug_if(subprog < 0, env, "No program to jit at insn %d",
+				    i + insn->imm + 1))
+			return -EFAULT;
+		/* temporarily remember subprog id inside insn instead of
+		 * aux_data, since next loop will split up all insns into funcs
+		 */
+		insn->off = subprog;
+		/* remember original imm in case JIT fails and fallback
+		 * to interpreter will be needed
+		 */
+		env->insn_aux_data[i].call_imm = insn->imm;
+		/* point imm to __bpf_call_base+1 from JITs point of view */
+		insn->imm = 1;
+		if (bpf_pseudo_func(insn)) {
+#if defined(MODULES_VADDR)
+			u64 addr = MODULES_VADDR;
+#else
+			u64 addr = VMALLOC_START;
+#endif
+			/* jit (e.g. x86_64) may emit fewer instructions
+			 * if it learns a u32 imm is the same as a u64 imm.
+			 * Set close enough to possible prog address.
+			 */
+			insn[0].imm = (u32)addr;
+			insn[1].imm = addr >> 32;
+		}
+	}
+
+	err = bpf_prog_alloc_jited_linfo(prog);
+	if (err)
+		goto out_undo_insn;
+
+	err = -ENOMEM;
+	func = kzalloc_objs(prog, env->subprog_cnt);
+	if (!func)
+		goto out_undo_insn;
+
+	/* build one bpf_prog per subprog and JIT it */
+	for (i = 0; i < env->subprog_cnt; i++) {
+		subprog_start = subprog_end;
+		subprog_end = env->subprog_info[i + 1].start;
+
+		len = subprog_end - subprog_start;
+		/* bpf_prog_run() doesn't call subprogs directly,
+		 * hence main prog stats include the runtime of subprogs.
+		 * subprogs don't have IDs and not reachable via prog_get_next_id
+		 * func[i]->stats will never be accessed and stays NULL
+		 */
+		func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);
+		if (!func[i])
+			goto out_free;
+		memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],
+		       len * sizeof(struct bpf_insn));
+		func[i]->type = prog->type;
+		func[i]->len = len;
+		if (bpf_prog_calc_tag(func[i]))
+			goto out_free;
+		func[i]->is_func = 1;
+		func[i]->sleepable = prog->sleepable;
+		func[i]->blinded = prog->blinded;
+		func[i]->aux->func_idx = i;
+		/* Below members will be freed only at prog->aux */
+		func[i]->aux->btf = prog->aux->btf;
+		func[i]->aux->subprog_start = subprog_start;
+		func[i]->aux->func_info = prog->aux->func_info;
+		func[i]->aux->func_info_cnt = prog->aux->func_info_cnt;
+		func[i]->aux->poke_tab = prog->aux->poke_tab;
+		func[i]->aux->size_poke_tab = prog->aux->size_poke_tab;
+		func[i]->aux->main_prog_aux = prog->aux;
+
+		/* repoint poke descriptors landing in this subprog */
+		for (j = 0; j < prog->aux->size_poke_tab; j++) {
+			struct bpf_jit_poke_descriptor *poke;
+
+			poke = &prog->aux->poke_tab[j];
+			if (poke->insn_idx < subprog_end &&
+			    poke->insn_idx >= subprog_start)
+				poke->aux = func[i]->aux;
+		}
+
+		func[i]->aux->name[0] = 'F';
+		func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
+		if (env->subprog_info[i].priv_stack_mode == PRIV_STACK_ADAPTIVE)
+			func[i]->aux->jits_use_priv_stack = true;
+
+		func[i]->jit_requested = 1;
+		func[i]->blinding_requested = prog->blinding_requested;
+		func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
+		func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab;
+		func[i]->aux->linfo = prog->aux->linfo;
+		func[i]->aux->nr_linfo = prog->aux->nr_linfo;
+		func[i]->aux->jited_linfo = prog->aux->jited_linfo;
+		func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
+		func[i]->aux->arena = prog->aux->arena;
+		func[i]->aux->used_maps = env->used_maps;
+		func[i]->aux->used_map_cnt = env->used_map_cnt;
+		/* count faulting (PROBE_*) insns for this subprog's extable */
+		num_exentries = 0;
+		insn = func[i]->insnsi;
+		for (j = 0; j < func[i]->len; j++, insn++) {
+			if (BPF_CLASS(insn->code) == BPF_LDX &&
+			    (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+			     BPF_MODE(insn->code) == BPF_PROBE_MEM32 ||
+			     BPF_MODE(insn->code) == BPF_PROBE_MEM32SX ||
+			     BPF_MODE(insn->code) == BPF_PROBE_MEMSX))
+				num_exentries++;
+			if ((BPF_CLASS(insn->code) == BPF_STX ||
+			     BPF_CLASS(insn->code) == BPF_ST) &&
+			     BPF_MODE(insn->code) == BPF_PROBE_MEM32)
+				num_exentries++;
+			if (BPF_CLASS(insn->code) == BPF_STX &&
+			     BPF_MODE(insn->code) == BPF_PROBE_ATOMIC)
+				num_exentries++;
+		}
+		func[i]->aux->num_exentries = num_exentries;
+		func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
+		func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb;
+		func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data;
+		func[i]->aux->might_sleep = env->subprog_info[i].might_sleep;
+		func[i]->aux->token = prog->aux->token;
+		if (!i)
+			func[i]->aux->exception_boundary = env->seen_exception;
+		func[i] = bpf_int_jit_compile(env, func[i]);
+		if (!func[i]->jited) {
+			err = -ENOTSUPP;
+			goto out_free;
+		}
+		cond_resched();
+	}
+
+	/* at this point all bpf functions were successfully JITed
+	 * now populate all bpf_calls with correct addresses and
+	 * run last pass of JIT
+	 */
+	for (i = 0; i < env->subprog_cnt; i++) {
+		insn = func[i]->insnsi;
+		for (j = 0; j < func[i]->len; j++, insn++) {
+			if (bpf_pseudo_func(insn)) {
+				subprog = insn->off;
+				insn[0].imm = (u32)(long)func[subprog]->bpf_func;
+				insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32;
+				continue;
+			}
+			if (!bpf_pseudo_call(insn))
+				continue;
+			subprog = insn->off;
+			insn->imm = BPF_CALL_IMM(func[subprog]->bpf_func);
+		}
+
+		/* we use the aux data to keep a list of the start addresses
+		 * of the JITed images for each function in the program
+		 *
+		 * for some architectures, such as powerpc64, the imm field
+		 * might not be large enough to hold the offset of the start
+		 * address of the callee's JITed image from __bpf_call_base
+		 *
+		 * in such cases, we can lookup the start address of a callee
+		 * by using its subprog id, available from the off field of
+		 * the call instruction, as an index for this list
+		 */
+		func[i]->aux->func = func;
+		func[i]->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
+		func[i]->aux->real_func_cnt = env->subprog_cnt;
+	}
+	/* re-JIT with final call addresses; the image must not move */
+	for (i = 0; i < env->subprog_cnt; i++) {
+		old_bpf_func = func[i]->bpf_func;
+		tmp = bpf_int_jit_compile(env, func[i]);
+		if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
+			verbose(env, "JIT doesn't support bpf-to-bpf calls\n");
+			err = -ENOTSUPP;
+			goto out_free;
+		}
+		cond_resched();
+	}
+
+	/*
+	 * Cleanup func[i]->aux fields which aren't required
+	 * or can become invalid in future
+	 */
+	for (i = 0; i < env->subprog_cnt; i++) {
+		func[i]->aux->used_maps = NULL;
+		func[i]->aux->used_map_cnt = 0;
+	}
+
+	/* finally lock prog and jit images for all functions and
+	 * populate kallsysm. Begin at the first subprogram, since
+	 * bpf_prog_load will add the kallsyms for the main program.
+	 */
+	for (i = 1; i < env->subprog_cnt; i++) {
+		err = bpf_prog_lock_ro(func[i]);
+		if (err)
+			goto out_free;
+	}
+
+	for (i = 1; i < env->subprog_cnt; i++)
+		bpf_prog_kallsyms_add(func[i]);
+
+	/* Last step: make now unused interpreter insns from main
+	 * prog consistent for later dump requests, so they can
+	 * later look the same as if they were interpreted only.
+	 */
+	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+		if (bpf_pseudo_func(insn)) {
+			insn[0].imm = env->insn_aux_data[i].call_imm;
+			insn[1].imm = insn->off;
+			insn->off = 0;
+			continue;
+		}
+		if (!bpf_pseudo_call(insn))
+			continue;
+		insn->off = env->insn_aux_data[i].call_imm;
+		subprog = bpf_find_subprog(env, i + insn->off + 1);
+		insn->imm = subprog;
+	}
+
+	prog->jited = 1;
+	prog->bpf_func = func[0]->bpf_func;
+	prog->jited_len = func[0]->jited_len;
+	prog->aux->extable = func[0]->aux->extable;
+	prog->aux->num_exentries = func[0]->aux->num_exentries;
+	prog->aux->func = func;
+	prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
+	prog->aux->real_func_cnt = env->subprog_cnt;
+	prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func;
+	prog->aux->exception_boundary = func[0]->aux->exception_boundary;
+	bpf_prog_jit_attempt_done(prog);
+	return 0;
+out_free:
+	/* We failed JIT'ing, so at this point we need to unregister poke
+	 * descriptors from subprogs, so that kernel is not attempting to
+	 * patch it anymore as we're freeing the subprog JIT memory.
+	 */
+	for (i = 0; i < prog->aux->size_poke_tab; i++) {
+		map_ptr = prog->aux->poke_tab[i].tail_call.map;
+		map_ptr->ops->map_poke_untrack(map_ptr, prog->aux);
+	}
+	/* At this point we're guaranteed that poke descriptors are not
+	 * live anymore. We can just unlink its descriptor table as it's
+	 * released with the main prog.
+	 */
+	for (i = 0; i < env->subprog_cnt; i++) {
+		if (!func[i])
+			continue;
+		func[i]->aux->poke_tab = NULL;
+		bpf_jit_free(func[i]);
+	}
+	kfree(func);
+out_undo_insn:
+	bpf_prog_jit_attempt_done(prog);
+	return err;
+}
+
+/*
+ * JIT a multi-subprog program.  When constant blinding is needed the
+ * prog is cloned first, and insn aux data plus subprog starts are
+ * snapshotted so a non-fatal JIT failure can be rolled back to the
+ * clean original prog and handled by the interpreter fallback.
+ */
+int bpf_jit_subprogs(struct bpf_verifier_env *env)
+{
+	int err, i;
+	bool blinded = false;
+	struct bpf_insn *insn;
+	struct bpf_prog *prog, *orig_prog;
+	struct bpf_insn_aux_data *orig_insn_aux;
+	u32 *orig_subprog_starts;
+
+	if (env->subprog_cnt <= 1)
+		return 0;
+
+	prog = orig_prog = env->prog;
+	if (bpf_prog_need_blind(prog)) {
+		/* blinding rewrites insns; snapshot state for rollback */
+		orig_insn_aux = bpf_dup_insn_aux_data(env);
+		if (!orig_insn_aux) {
+			err = -ENOMEM;
+			goto out_cleanup;
+		}
+		orig_subprog_starts = bpf_dup_subprog_starts(env);
+		if (!orig_subprog_starts) {
+			vfree(orig_insn_aux);
+			err = -ENOMEM;
+			goto out_cleanup;
+		}
+		prog = bpf_jit_blind_constants(env, prog);
+		if (IS_ERR(prog)) {
+			err = -ENOMEM;
+			prog = orig_prog;
+			goto out_restore;
+		}
+		blinded = true;
+	}
+
+	err = jit_subprogs(env);
+	if (err)
+		goto out_jit_err;
+
+	if (blinded) {
+		/* JIT succeeded on the blinded clone; drop the original */
+		bpf_jit_prog_release_other(prog, orig_prog);
+		kvfree(orig_subprog_starts);
+		vfree(orig_insn_aux);
+	}
+
+	return 0;
+
+out_jit_err:
+	if (blinded) {
+		bpf_jit_prog_release_other(orig_prog, prog);
+		/* roll back to the clean original prog */
+		prog = env->prog = orig_prog;
+		goto out_restore;
+	} else {
+		if (err != -EFAULT) {
+			/*
+			 * We will fall back to interpreter mode when err is not -EFAULT, before
+			 * that, insn->off and insn->imm should be restored to their original
+			 * values since they were modified by jit_subprogs.
+			 */
+			for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+				if (!bpf_pseudo_call(insn))
+					continue;
+				insn->off = 0;
+				insn->imm = env->insn_aux_data[i].call_imm;
+			}
+		}
+		goto out_cleanup;
+	}
+
+out_restore:
+	/* only reachable from the blinded path, where both snapshots exist */
+	bpf_restore_subprog_starts(env, orig_subprog_starts);
+	bpf_restore_insn_aux_data(env, orig_insn_aux);
+	kvfree(orig_subprog_starts);
+out_cleanup:
+	/* cleanup main prog to be interpreted */
+	prog->jit_requested = 0;
+	prog->blinding_requested = 0;
+	return err;
+}
+
+/*
+ * Final bpf-to-bpf call fixup: JIT all subprogs when requested.  If the
+ * JIT fails non-fatally and CONFIG_BPF_JIT_ALWAYS_ON is not set, fall
+ * back to the interpreter, which requires patching the callee stack
+ * depth into each call insn and rejects kfunc calls, callbacks and
+ * tail_calls (unsupported by the interpreter for this case).
+ */
+int bpf_fixup_call_args(struct bpf_verifier_env *env)
+{
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+	struct bpf_prog *prog = env->prog;
+	struct bpf_insn *insn = prog->insnsi;
+	bool has_kfunc_call = bpf_prog_has_kfunc_call(prog);
+	int i, depth;
+#endif
+	int err = 0;
+
+	if (env->prog->jit_requested &&
+	    !bpf_prog_is_offloaded(env->prog->aux)) {
+		err = bpf_jit_subprogs(env);
+		if (err == 0)
+			return 0;
+		/* -EFAULT means hard reject, no interpreter fallback */
+		if (err == -EFAULT)
+			return err;
+	}
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+	if (has_kfunc_call) {
+		verbose(env, "calling kernel functions are not allowed in non-JITed programs\n");
+		return -EINVAL;
+	}
+	if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) {
+		/* When JIT fails the progs with bpf2bpf calls and tail_calls
+		 * have to be rejected, since interpreter doesn't support them yet.
+		 */
+		verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n");
+		return -EINVAL;
+	}
+	for (i = 0; i < prog->len; i++, insn++) {
+		if (bpf_pseudo_func(insn)) {
+			/* When JIT fails the progs with callback calls
+			 * have to be rejected, since interpreter doesn't support them yet.
+			 */
+			verbose(env, "callbacks are not allowed in non-JITed programs\n");
+			return -EINVAL;
+		}
+
+		if (!bpf_pseudo_call(insn))
+			continue;
+		depth = get_callee_stack_depth(env, insn, i);
+		if (depth < 0)
+			return depth;
+		bpf_patch_call_args(insn, depth);
+	}
+	err = 0;
+#endif
+	return err;
+}
+
+
+/* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */
+static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
+{
+	struct bpf_subprog_info *info = env->subprog_info;
+	int cnt = env->subprog_cnt;
+	struct bpf_prog *prog;
+
+	/* We only reserve one slot for hidden subprogs in subprog_info. */
+	if (env->hidden_subprog_cnt) {
+		verifier_bug(env, "only one hidden subprog supported");
+		return -EFAULT;
+	}
+	/* We're not patching any existing instruction, just appending the new
+	 * ones for the hidden subprog. Hence all of the adjustment operations
+	 * in bpf_patch_insn_data are no-ops.
+	 */
+	prog = bpf_patch_insn_data(env, env->prog->len - 1, patch, len);
+	if (!prog)
+		return -ENOMEM;
+	env->prog = prog;
+	/* shift the fake 'exit' subprog boundary up and place the hidden
+	 * subprog right after the original last insn (patch[0] replayed it)
+	 */
+	info[cnt + 1].start = info[cnt].start;
+	info[cnt].start = prog->len - len + 1;
+	env->subprog_cnt++;
+	env->hidden_subprog_cnt++;
+	return 0;
+}
+
+/* Do various post-verification rewrites in a single program pass.
+ * These rewrites simplify JIT and interpreter implementations.
+ */
+int bpf_do_misc_fixups(struct bpf_verifier_env *env)
+{
+ struct bpf_prog *prog = env->prog;
+ enum bpf_attach_type eatype = prog->expected_attach_type;
+ enum bpf_prog_type prog_type = resolve_prog_type(prog);
+ struct bpf_insn *insn = prog->insnsi;
+ const struct bpf_func_proto *fn;
+ const int insn_cnt = prog->len;
+ const struct bpf_map_ops *ops;
+ struct bpf_insn_aux_data *aux;
+ struct bpf_insn *insn_buf = env->insn_buf;
+ struct bpf_prog *new_prog;
+ struct bpf_map *map_ptr;
+ int i, ret, cnt, delta = 0, cur_subprog = 0;
+ struct bpf_subprog_info *subprogs = env->subprog_info;
+ u16 stack_depth = subprogs[cur_subprog].stack_depth;
+ u16 stack_depth_extra = 0;
+
+ if (env->seen_exception && !env->exception_callback_subprog) {
+ struct bpf_insn *patch = insn_buf;
+
+ *patch++ = env->prog->insnsi[insn_cnt - 1];
+ *patch++ = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
+ *patch++ = BPF_EXIT_INSN();
+ ret = add_hidden_subprog(env, insn_buf, patch - insn_buf);
+ if (ret < 0)
+ return ret;
+ prog = env->prog;
+ insn = prog->insnsi;
+
+ env->exception_callback_subprog = env->subprog_cnt - 1;
+ /* Don't update insn_cnt, as add_hidden_subprog always appends insns */
+ bpf_mark_subprog_exc_cb(env, env->exception_callback_subprog);
+ }
+
+ for (i = 0; i < insn_cnt;) {
+ if (insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->imm) {
+ if ((insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1) ||
+ (((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) {
+ /* convert to 32-bit mov that clears upper 32-bit */
+ insn->code = BPF_ALU | BPF_MOV | BPF_X;
+ /* clear off and imm, so it's a normal 'wX = wY' from JIT pov */
+ insn->off = 0;
+ insn->imm = 0;
+ } /* cast from as(0) to as(1) should be handled by JIT */
+ goto next_insn;
+ }
+
+ if (env->insn_aux_data[i + delta].needs_zext)
+ /* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */
+ insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code);
+
+ /* Make sdiv/smod divide-by-minus-one exceptions impossible. */
+ if ((insn->code == (BPF_ALU64 | BPF_MOD | BPF_K) ||
+ insn->code == (BPF_ALU64 | BPF_DIV | BPF_K) ||
+ insn->code == (BPF_ALU | BPF_MOD | BPF_K) ||
+ insn->code == (BPF_ALU | BPF_DIV | BPF_K)) &&
+ insn->off == 1 && insn->imm == -1) {
+ bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+ bool isdiv = BPF_OP(insn->code) == BPF_DIV;
+ struct bpf_insn *patch = insn_buf;
+
+ if (isdiv)
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_NEG | BPF_K, insn->dst_reg,
+ 0, 0, 0);
+ else
+ *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0);
+
+ cnt = patch - insn_buf;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Make divide-by-zero and divide-by-minus-one exceptions impossible. */
+ if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
+ insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
+ insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
+ insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
+ bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+ bool isdiv = BPF_OP(insn->code) == BPF_DIV;
+ bool is_sdiv = isdiv && insn->off == 1;
+ bool is_smod = !isdiv && insn->off == 1;
+ struct bpf_insn *patch = insn_buf;
+
+ if (is_sdiv) {
+ /* [R,W]x sdiv 0 -> 0
+ * LLONG_MIN sdiv -1 -> LLONG_MIN
+ * INT_MIN sdiv -1 -> INT_MIN
+ */
+ *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_ADD | BPF_K, BPF_REG_AX,
+ 0, 0, 1);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JGT | BPF_K, BPF_REG_AX,
+ 0, 4, 1);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JEQ | BPF_K, BPF_REG_AX,
+ 0, 1, 0);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_MOV | BPF_K, insn->dst_reg,
+ 0, 0, 0);
+ /* BPF_NEG(LLONG_MIN) == -LLONG_MIN == LLONG_MIN */
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_NEG | BPF_K, insn->dst_reg,
+ 0, 0, 0);
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = *insn;
+ cnt = patch - insn_buf;
+ } else if (is_smod) {
+ /* [R,W]x mod 0 -> [R,W]x */
+ /* [R,W]x mod -1 -> 0 */
+ *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_ADD | BPF_K, BPF_REG_AX,
+ 0, 0, 1);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JGT | BPF_K, BPF_REG_AX,
+ 0, 3, 1);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JEQ | BPF_K, BPF_REG_AX,
+ 0, 3 + (is64 ? 0 : 1), 1);
+ *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0);
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = *insn;
+
+ if (!is64) {
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg);
+ }
+ cnt = patch - insn_buf;
+ } else if (isdiv) {
+ /* [R,W]x div 0 -> 0 */
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JNE | BPF_K, insn->src_reg,
+ 0, 2, 0);
+ *patch++ = BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg);
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = *insn;
+ cnt = patch - insn_buf;
+ } else {
+ /* [R,W]x mod 0 -> [R,W]x */
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JEQ | BPF_K, insn->src_reg,
+ 0, 1 + (is64 ? 0 : 1), 0);
+ *patch++ = *insn;
+
+ if (!is64) {
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg);
+ }
+ cnt = patch - insn_buf;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Make it impossible to de-reference a userspace address */
+ if (BPF_CLASS(insn->code) == BPF_LDX &&
+ (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
+ struct bpf_insn *patch = insn_buf;
+ u64 uaddress_limit = bpf_arch_uaddress_limit();
+
+ if (!uaddress_limit)
+ goto next_insn;
+
+ *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
+ if (insn->off)
+ *patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off);
+ *patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32);
+ *patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2);
+ *patch++ = *insn;
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = BPF_MOV64_IMM(insn->dst_reg, 0);
+
+ cnt = patch - insn_buf;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
+ if (BPF_CLASS(insn->code) == BPF_LD &&
+ (BPF_MODE(insn->code) == BPF_ABS ||
+ BPF_MODE(insn->code) == BPF_IND)) {
+ cnt = env->ops->gen_ld_abs(insn, insn_buf);
+ if (cnt == 0 || cnt >= INSN_BUF_SIZE) {
+ verifier_bug(env, "%d insns generated for ld_abs", cnt);
+ return -EFAULT;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Rewrite pointer arithmetic to mitigate speculation attacks. */
+ if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) ||
+ insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {
+ const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
+ const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;
+ struct bpf_insn *patch = insn_buf;
+ bool issrc, isneg, isimm;
+ u32 off_reg;
+
+ aux = &env->insn_aux_data[i + delta];
+ if (!aux->alu_state ||
+ aux->alu_state == BPF_ALU_NON_POINTER)
+ goto next_insn;
+
+ isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
+ issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
+ BPF_ALU_SANITIZE_SRC;
+ isimm = aux->alu_state & BPF_ALU_IMMEDIATE;
+
+ off_reg = issrc ? insn->src_reg : insn->dst_reg;
+ if (isimm) {
+ *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
+ } else {
+ if (isneg)
+ *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
+ *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
+ *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
+ *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
+ *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
+ *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
+ *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg);
+ }
+ if (!issrc)
+ *patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg);
+ insn->src_reg = BPF_REG_AX;
+ if (isneg)
+ insn->code = insn->code == code_add ?
+ code_sub : code_add;
+ *patch++ = *insn;
+ if (issrc && isneg && !isimm)
+ *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
+ cnt = patch - insn_buf;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ if (bpf_is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) {
+ int stack_off_cnt = -stack_depth - 16;
+
+ /*
+ * Two 8 byte slots, depth-16 stores the count, and
+ * depth-8 stores the start timestamp of the loop.
+ *
+ * The starting value of count is BPF_MAX_TIMED_LOOPS
+ * (0xffff). Every iteration loads it and subs it by 1,
+ * until the value becomes 0 in AX (thus, 1 in stack),
+ * after which we call arch_bpf_timed_may_goto, which
+ * either sets AX to 0xffff to keep looping, or to 0
+ * upon timeout. AX is then stored into the stack. In
+ * the next iteration, we either see 0 and break out, or
+ * continue iterating until the next time value is 0
+ * after subtraction, rinse and repeat.
+ */
+ stack_depth_extra = 16;
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt);
+ if (insn->off >= 0)
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5);
+ else
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
+ insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2);
+ /*
+ * AX is used as an argument to pass in stack_off_cnt
+ * (to add to r10/fp), and also as the return value of
+ * the call to arch_bpf_timed_may_goto.
+ */
+ insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt);
+ insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto);
+ insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt);
+ cnt = 7;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ } else if (bpf_is_may_goto_insn(insn)) {
+ int stack_off = -stack_depth - 8;
+
+ stack_depth_extra = 8;
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off);
+ if (insn->off >= 0)
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2);
+ else
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
+ insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off);
+ cnt = 4;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ if (insn->code != (BPF_JMP | BPF_CALL))
+ goto next_insn;
+ if (insn->src_reg == BPF_PSEUDO_CALL)
+ goto next_insn;
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ ret = bpf_fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
+ if (ret)
+ return ret;
+ if (cnt == 0)
+ goto next_insn;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Skip inlining the helper call if the JIT does it. */
+ if (bpf_jit_inlines_helper_call(insn->imm))
+ goto next_insn;
+
+ if (insn->imm == BPF_FUNC_get_route_realm)
+ prog->dst_needed = 1;
+ if (insn->imm == BPF_FUNC_get_prandom_u32)
+ bpf_user_rnd_init_once();
+ if (insn->imm == BPF_FUNC_override_return)
+ prog->kprobe_override = 1;
+ if (insn->imm == BPF_FUNC_tail_call) {
+ /* If we tail call into other programs, we
+ * cannot make any assumptions since they can
+ * be replaced dynamically during runtime in
+ * the program array.
+ */
+ prog->cb_access = 1;
+ if (!bpf_allow_tail_call_in_subprogs(env))
+ prog->aux->stack_depth = MAX_BPF_STACK;
+ prog->aux->max_pkt_offset = MAX_PACKET_OFF;
+
+ /* mark bpf_tail_call as different opcode to avoid
+ * conditional branch in the interpreter for every normal
+ * call and to prevent accidental JITing by JIT compiler
+ * that doesn't support bpf_tail_call yet
+ */
+ insn->imm = 0;
+ insn->code = BPF_JMP | BPF_TAIL_CALL;
+
+ aux = &env->insn_aux_data[i + delta];
+ if (env->bpf_capable && !prog->blinding_requested &&
+ prog->jit_requested &&
+ !bpf_map_key_poisoned(aux) &&
+ !bpf_map_ptr_poisoned(aux) &&
+ !bpf_map_ptr_unpriv(aux)) {
+ struct bpf_jit_poke_descriptor desc = {
+ .reason = BPF_POKE_REASON_TAIL_CALL,
+ .tail_call.map = aux->map_ptr_state.map_ptr,
+ .tail_call.key = bpf_map_key_immediate(aux),
+ .insn_idx = i + delta,
+ };
+
+ ret = bpf_jit_add_poke_descriptor(prog, &desc);
+ if (ret < 0) {
+ verbose(env, "adding tail call poke descriptor failed\n");
+ return ret;
+ }
+
+ insn->imm = ret + 1;
+ goto next_insn;
+ }
+
+ if (!bpf_map_ptr_unpriv(aux))
+ goto next_insn;
+
+ /* instead of changing every JIT dealing with tail_call
+ * emit two extra insns:
+ * if (index >= max_entries) goto out;
+ * index &= array->index_mask;
+ * to avoid out-of-bounds cpu speculation
+ */
+ if (bpf_map_ptr_poisoned(aux)) {
+ verbose(env, "tail_call abusing map_ptr\n");
+ return -EINVAL;
+ }
+
+ map_ptr = aux->map_ptr_state.map_ptr;
+ insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
+ map_ptr->max_entries, 2);
+ insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
+ container_of(map_ptr,
+ struct bpf_array,
+ map)->index_mask);
+ insn_buf[2] = *insn;
+ cnt = 3;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ if (insn->imm == BPF_FUNC_timer_set_callback) {
+ /* The verifier will process callback_fn as many times as necessary
+ * with different maps and the register states prepared by
+ * set_timer_callback_state will be accurate.
+ *
+ * The following use case is valid:
+ * map1 is shared by prog1, prog2, prog3.
+ * prog1 calls bpf_timer_init for some map1 elements
+ * prog2 calls bpf_timer_set_callback for some map1 elements.
+ * Those that were not bpf_timer_init-ed will return -EINVAL.
+ * prog3 calls bpf_timer_start for some map1 elements.
+ * Those that were not both bpf_timer_init-ed and
+ * bpf_timer_set_callback-ed will return -EINVAL.
+ */
+ struct bpf_insn ld_addrs[2] = {
+ BPF_LD_IMM64(BPF_REG_3, (long)prog->aux),
+ };
+
+ insn_buf[0] = ld_addrs[0];
+ insn_buf[1] = ld_addrs[1];
+ insn_buf[2] = *insn;
+ cnt = 3;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
+ /* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */
+ if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) {
+ /* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data,
+ * bpf_mem_alloc() returns a ptr to the percpu data ptr.
+ */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0);
+ insn_buf[1] = *insn;
+ cnt = 2;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
+ /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
+ * and other inlining handlers are currently limited to 64 bit
+ * only.
+ */
+ if (prog->jit_requested && BITS_PER_LONG == 64 &&
+ (insn->imm == BPF_FUNC_map_lookup_elem ||
+ insn->imm == BPF_FUNC_map_update_elem ||
+ insn->imm == BPF_FUNC_map_delete_elem ||
+ insn->imm == BPF_FUNC_map_push_elem ||
+ insn->imm == BPF_FUNC_map_pop_elem ||
+ insn->imm == BPF_FUNC_map_peek_elem ||
+ insn->imm == BPF_FUNC_redirect_map ||
+ insn->imm == BPF_FUNC_for_each_map_elem ||
+ insn->imm == BPF_FUNC_map_lookup_percpu_elem)) {
+ aux = &env->insn_aux_data[i + delta];
+ if (bpf_map_ptr_poisoned(aux))
+ goto patch_call_imm;
+
+ map_ptr = aux->map_ptr_state.map_ptr;
+ ops = map_ptr->ops;
+ if (insn->imm == BPF_FUNC_map_lookup_elem &&
+ ops->map_gen_lookup) {
+ cnt = ops->map_gen_lookup(map_ptr, insn_buf);
+ if (cnt == -EOPNOTSUPP)
+ goto patch_map_ops_generic;
+ if (cnt <= 0 || cnt >= INSN_BUF_SIZE) {
+ verifier_bug(env, "%d insns generated for map lookup", cnt);
+ return -EFAULT;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta,
+ insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
+ (void *(*)(struct bpf_map *map, void *key))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_delete_elem,
+ (long (*)(struct bpf_map *map, void *key))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_update_elem,
+ (long (*)(struct bpf_map *map, void *key, void *value,
+ u64 flags))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_push_elem,
+ (long (*)(struct bpf_map *map, void *value,
+ u64 flags))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_pop_elem,
+ (long (*)(struct bpf_map *map, void *value))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
+ (long (*)(struct bpf_map *map, void *value))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_redirect,
+ (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_for_each_callback,
+ (long (*)(struct bpf_map *map,
+ bpf_callback_t callback_fn,
+ void *callback_ctx,
+ u64 flags))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_lookup_percpu_elem,
+ (void *(*)(struct bpf_map *map, void *key, u32 cpu))NULL));
+
+patch_map_ops_generic:
+ switch (insn->imm) {
+ case BPF_FUNC_map_lookup_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_lookup_elem);
+ goto next_insn;
+ case BPF_FUNC_map_update_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_update_elem);
+ goto next_insn;
+ case BPF_FUNC_map_delete_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_delete_elem);
+ goto next_insn;
+ case BPF_FUNC_map_push_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_push_elem);
+ goto next_insn;
+ case BPF_FUNC_map_pop_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_pop_elem);
+ goto next_insn;
+ case BPF_FUNC_map_peek_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_peek_elem);
+ goto next_insn;
+ case BPF_FUNC_redirect_map:
+ insn->imm = BPF_CALL_IMM(ops->map_redirect);
+ goto next_insn;
+ case BPF_FUNC_for_each_map_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_for_each_callback);
+ goto next_insn;
+ case BPF_FUNC_map_lookup_percpu_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem);
+ goto next_insn;
+ }
+
+ goto patch_call_imm;
+ }
+
+ /* Implement bpf_jiffies64 inline. */
+ if (prog->jit_requested && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_jiffies64) {
+ struct bpf_insn ld_jiffies_addr[2] = {
+ BPF_LD_IMM64(BPF_REG_0,
+ (unsigned long)&jiffies),
+ };
+
+ insn_buf[0] = ld_jiffies_addr[0];
+ insn_buf[1] = ld_jiffies_addr[1];
+ insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0,
+ BPF_REG_0, 0);
+ cnt = 3;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
+ cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
+ /* Implement bpf_get_smp_processor_id() inline. */
+ if (insn->imm == BPF_FUNC_get_smp_processor_id &&
+ bpf_verifier_inlines_helper_call(env, insn->imm)) {
+ /* BPF_FUNC_get_smp_processor_id inlining is an
+ * optimization, so if cpu_number is ever
+ * changed in some incompatible and hard to support
+ * way, it's fine to back out this inlining logic
+ */
+#ifdef CONFIG_SMP
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&cpu_number);
+ insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
+ insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0);
+ cnt = 3;
+#else
+ insn_buf[0] = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
+ cnt = 1;
+#endif
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Implement bpf_get_current_task() and bpf_get_current_task_btf() inline. */
+ if ((insn->imm == BPF_FUNC_get_current_task || insn->imm == BPF_FUNC_get_current_task_btf) &&
+ bpf_verifier_inlines_helper_call(env, insn->imm)) {
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&current_task);
+ insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
+ insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
+ cnt = 3;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+#endif
+ /* Implement bpf_get_func_arg inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_arg) {
+ if (eatype == BPF_TRACE_RAW_TP) {
+ int nr_args = btf_type_vlen(prog->aux->attach_func_proto);
+
+ /* skip 'void *__data' in btf_trace_##name() and save to reg0 */
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1);
+ cnt = 1;
+ } else {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
+ cnt = 2;
+ }
+ insn_buf[cnt++] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6);
+ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3);
+ insn_buf[cnt++] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1);
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0);
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
+ insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, 0);
+ insn_buf[cnt++] = BPF_JMP_A(1);
+ insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Implement bpf_get_func_ret inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_ret) {
+ if (eatype == BPF_TRACE_FEXIT ||
+ eatype == BPF_TRACE_FSESSION ||
+ eatype == BPF_MODIFY_RETURN) {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
+ insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
+ insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
+ insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0);
+ insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0);
+ cnt = 7;
+ } else {
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP);
+ cnt = 1;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Implement get_func_arg_cnt inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_arg_cnt) {
+ if (eatype == BPF_TRACE_RAW_TP) {
+ int nr_args = btf_type_vlen(prog->aux->attach_func_proto);
+
+ /* skip 'void *__data' in btf_trace_##name() and save to reg0 */
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1);
+ cnt = 1;
+ } else {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
+ cnt = 2;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Implement bpf_get_func_ip inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_ip) {
+ /* Load IP address from ctx - 16 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -16);
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
+ if (!new_prog)
+ return -ENOMEM;
+
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Implement bpf_get_branch_snapshot inline. */
+ if (IS_ENABLED(CONFIG_PERF_EVENTS) &&
+ prog->jit_requested && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_get_branch_snapshot) {
+ /* We are dealing with the following func protos:
+ * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags);
+ * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt);
+ */
+ const u32 br_entry_size = sizeof(struct perf_branch_entry);
+
+ /* struct perf_branch_entry is part of UAPI and is
+ * used as an array element, so extremely unlikely to
+ * ever grow or shrink
+ */
+ BUILD_BUG_ON(br_entry_size != 24);
+
+ /* if (unlikely(flags)) return -EINVAL */
+ insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7);
+
+ /* Transform size (bytes) into number of entries (cnt = size / 24).
+ * But to avoid expensive division instruction, we implement
+ * divide-by-3 through multiplication, followed by further
+ * division by 8 through 3-bit right shift.
+ * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr.,
+ * p. 227, chapter "Unsigned Division by 3" for details and proofs.
+ *
+ * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab.
+ */
+ insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab);
+ insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0);
+ insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36);
+
+ /* call perf_snapshot_branch_stack implementation */
+ insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack));
+ /* if (entry_cnt == 0) return -ENOENT */
+ insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4);
+ /* return entry_cnt * sizeof(struct perf_branch_entry) */
+ insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size);
+ insn_buf[7] = BPF_JMP_A(3);
+ /* return -EINVAL; */
+ insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
+ insn_buf[9] = BPF_JMP_A(1);
+ /* return -ENOENT; */
+ insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT);
+ cnt = 11;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Implement bpf_kptr_xchg inline */
+ if (prog->jit_requested && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_kptr_xchg &&
+ bpf_jit_supports_ptr_xchg()) {
+ insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_2);
+ insn_buf[1] = BPF_ATOMIC_OP(BPF_DW, BPF_XCHG, BPF_REG_1, BPF_REG_0, 0);
+ cnt = 2;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+patch_call_imm:
+ fn = env->ops->get_func_proto(insn->imm, env->prog);
+ /* all functions that have prototype and verifier allowed
+ * programs to call them, must be real in-kernel functions
+ */
+ if (!fn->func) {
+ verifier_bug(env,
+ "not inlined functions %s#%d is missing func",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
+ }
+ insn->imm = fn->func - __bpf_call_base;
+next_insn:
+ if (subprogs[cur_subprog + 1].start == i + delta + 1) {
+ subprogs[cur_subprog].stack_depth += stack_depth_extra;
+ subprogs[cur_subprog].stack_extra = stack_depth_extra;
+
+ stack_depth = subprogs[cur_subprog].stack_depth;
+ if (stack_depth > MAX_BPF_STACK && !prog->jit_requested) {
+ verbose(env, "stack size %d(extra %d) is too large\n",
+ stack_depth, stack_depth_extra);
+ return -EINVAL;
+ }
+ cur_subprog++;
+ stack_depth = subprogs[cur_subprog].stack_depth;
+ stack_depth_extra = 0;
+ }
+ i++;
+ insn++;
+ }
+
+ env->prog->aux->stack_depth = subprogs[0].stack_depth;
+ for (i = 0; i < env->subprog_cnt; i++) {
+ int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1;
+ int subprog_start = subprogs[i].start;
+ int stack_slots = subprogs[i].stack_extra / 8;
+ int slots = delta, cnt = 0;
+
+ if (!stack_slots)
+ continue;
+ /* We need two slots in case timed may_goto is supported. */
+ if (stack_slots > slots) {
+ verifier_bug(env, "stack_slots supports may_goto only");
+ return -EFAULT;
+ }
+
+ stack_depth = subprogs[i].stack_depth;
+ if (bpf_jit_supports_timed_may_goto()) {
+ insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
+ BPF_MAX_TIMED_LOOPS);
+ insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth + 8, 0);
+ } else {
+ /* Add ST insn to subprog prologue to init extra stack */
+ insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
+ BPF_MAX_LOOPS);
+ }
+ /* Copy first actual insn to preserve it */
+ insn_buf[cnt++] = env->prog->insnsi[subprog_start];
+
+ new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+ env->prog = prog = new_prog;
+ /*
+ * If may_goto is a first insn of a prog there could be a jmp
+ * insn that points to it, hence adjust all such jmps to point
+ * to insn after BPF_ST that inits may_goto count.
+ * Adjustment will succeed because bpf_patch_insn_data() didn't fail.
+ */
+ WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta));
+ }
+
+ /* Since poke tab is now finalized, publish aux to tracker. */
+ for (i = 0; i < prog->aux->size_poke_tab; i++) {
+ map_ptr = prog->aux->poke_tab[i].tail_call.map;
+ if (!map_ptr->ops->map_poke_track ||
+ !map_ptr->ops->map_poke_untrack ||
+ !map_ptr->ops->map_poke_run) {
+ verifier_bug(env, "poke tab is misconfigured");
+ return -EFAULT;
+ }
+
+ ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux);
+ if (ret < 0) {
+ verbose(env, "tracking tail call prog failed\n");
+ return ret;
+ }
+ }
+
+ ret = sort_kfunc_descs_by_imm_off(env);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+/*
+ * Replace the bpf_loop() helper call at instruction @position with an
+ * open-coded loop (an inlined equivalent of bpf_iter.c:bpf_loop).
+ *
+ * @position:           index of the bpf_loop() call to replace
+ * @stack_base:         offset of the lowest of three 8-byte stack slots
+ *                      used to spill callee-saved R6, R7 and R8
+ * @callback_subprogno: subprog number of the loop callback; the relative
+ *                      call offset is fixed up after patching
+ * @total_cnt:          out-param, set to the number of insns emitted
+ *
+ * Returns the patched program, or NULL on allocation failure inside
+ * bpf_patch_insn_data() (the caller maps NULL to -ENOMEM).
+ */
+static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
+ int position,
+ s32 stack_base,
+ u32 callback_subprogno,
+ u32 *total_cnt)
+{
+ s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
+ s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
+ s32 r8_offset = stack_base + 2 * BPF_REG_SIZE;
+ int reg_loop_max = BPF_REG_6;
+ int reg_loop_cnt = BPF_REG_7;
+ int reg_loop_ctx = BPF_REG_8;
+
+ struct bpf_insn *insn_buf = env->insn_buf;
+ struct bpf_prog *new_prog;
+ u32 callback_start;
+ u32 call_insn_offset;
+ s32 callback_offset;
+ u32 cnt = 0;
+
+ /* This represents an inlined version of bpf_iter.c:bpf_loop,
+ * be careful to modify this code in sync.
+ */
+
+ /* Return error and jump to the end of the patch if
+ * expected number of iterations is too big.
+ */
+ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2);
+ insn_buf[cnt++] = BPF_MOV32_IMM(BPF_REG_0, -E2BIG);
+ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JA, 0, 0, 16); /* skip the remaining 16 insns of the patch */
+ /* spill R6, R7, R8 to use these as loop vars */
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset);
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset);
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset);
+ /* initialize loop vars */
+ insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_max, BPF_REG_1);
+ insn_buf[cnt++] = BPF_MOV32_IMM(reg_loop_cnt, 0);
+ insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3);
+ /* loop header,
+ * if reg_loop_cnt >= reg_loop_max skip the loop body
+ */
+ insn_buf[cnt++] = BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5);
+ /* callback call,
+ * correct callback offset would be set after patching
+ */
+ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt);
+ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx);
+ insn_buf[cnt++] = BPF_CALL_REL(0);
+ /* increment loop counter */
+ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1);
+ /* jump to loop header if callback returned 0 */
+ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6);
+ /* return value of bpf_loop,
+ * set R0 to the number of iterations
+ */
+ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt);
+ /* restore original values of R6, R7, R8 */
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset);
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset);
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset);
+
+ *total_cnt = cnt;
+ new_prog = bpf_patch_insn_data(env, position, insn_buf, cnt);
+ if (!new_prog)
+ return new_prog; /* NULL: propagate allocation failure to caller */
+
+ /* callback start is known only after patching */
+ callback_start = env->subprog_info[callback_subprogno].start;
+ /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */
+ call_insn_offset = position + 12;
+ /* BPF_CALL_REL imm is relative to the insn following the call */
+ callback_offset = callback_start - call_insn_offset - 1;
+ new_prog->insnsi[call_insn_offset].imm = callback_offset;
+
+ return new_prog;
+}
+
+/*
+ * True if @insn is a direct helper call to bpf_loop(): a BPF_CALL with
+ * src_reg == 0 (i.e. neither a pseudo subprog call nor a kfunc call)
+ * whose imm selects BPF_FUNC_loop.
+ */
+static bool is_bpf_loop_call(struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == 0 &&
+ insn->imm == BPF_FUNC_loop;
+}
+
+/* For all sub-programs in the program (including main) check
+ * insn_aux_data to see if there are bpf_loop calls that require
+ * inlining. If such calls are found the calls are replaced with a
+ * sequence of instructions produced by `inline_bpf_loop` function and
+ * subprog stack_depth is increased by the size of 3 registers.
+ * This stack space is used to spill values of the R6, R7, R8. These
+ * registers are used to store the loop bound, counter and context
+ * variables.
+ *
+ * Returns 0 on success, -ENOMEM if patching fails.
+ */
+int bpf_optimize_bpf_loop(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprogs = env->subprog_info;
+ /* delta = insns added so far; i + delta maps the original insn
+ * index onto the patched program.
+ */
+ int i, cur_subprog = 0, cnt, delta = 0;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ u16 stack_depth = subprogs[cur_subprog].stack_depth;
+ /* pad current depth to 8-byte alignment before adding spill slots */
+ u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
+ u16 stack_depth_extra = 0;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ struct bpf_loop_inline_state *inline_state =
+ &env->insn_aux_data[i + delta].loop_inline_state;
+
+ if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) {
+ struct bpf_prog *new_prog;
+
+ /* three 8-byte slots for the R6/R7/R8 spills, plus
+ * alignment padding; stack grows down, hence the
+ * negative stack_base passed to inline_bpf_loop().
+ */
+ stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup;
+ new_prog = inline_bpf_loop(env,
+ i + delta,
+ -(stack_depth + stack_depth_extra),
+ inline_state->callback_subprogno,
+ &cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ }
+
+ /* crossing into the next subprog: commit the extra stack
+ * needed by this subprog and reset per-subprog state.
+ */
+ if (subprogs[cur_subprog + 1].start == i + delta + 1) {
+ subprogs[cur_subprog].stack_depth += stack_depth_extra;
+ cur_subprog++;
+ stack_depth = subprogs[cur_subprog].stack_depth;
+ stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
+ stack_depth_extra = 0;
+ }
+ }
+
+ env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
+
+ return 0;
+}
+
+/* Remove unnecessary spill/fill pairs, members of fastcall pattern,
+ * adjust subprograms stack depth when possible.
+ *
+ * For every insn marked with fastcall_spills_num == K, the K insns
+ * immediately before and the K insns immediately after it are the
+ * paired spills/fills; both sides are turned into NOPs here.
+ *
+ * Always returns 0.
+ */
+int bpf_remove_fastcall_spills_fills(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprog = env->subprog_info;
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ u32 spills_num;
+ bool modified = false;
+ int i, j;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (aux[i].fastcall_spills_num > 0) {
+ spills_num = aux[i].fastcall_spills_num;
+ /* NOPs would be removed by opt_remove_nops() */
+ for (j = 1; j <= spills_num; ++j) {
+ *(insn - j) = NOP;
+ *(insn + j) = NOP;
+ }
+ modified = true;
+ }
+ /* subprog boundary: shrink its stack depth only if we
+ * removed something and the subprog does not insist on
+ * keeping the fastcall stack region.
+ */
+ if ((subprog + 1)->start == i + 1) {
+ if (modified && !subprog->keep_fastcall_stack)
+ subprog->stack_depth = -subprog->fastcall_stack_off;
+ subprog++;
+ modified = false;
+ }
+ }
+
+ return 0;
+}
+
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3b9d297a53be..3dd9b4924ae4 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -125,6 +125,11 @@ struct htab_elem {
char key[] __aligned(8);
};
+/*
+ * Context handed to the bpf_mem_alloc element destructors
+ * (htab_mem_dtor()/htab_pcpu_mem_dtor()); freed via htab_dtor_ctx_free().
+ */
+struct htab_btf_record {
+ struct btf_record *record; /* dup of map->record (see htab_set_dtor()) */
+ u32 key_size; /* map key size; locates the value inside an elem */
+};
+
static inline bool htab_is_prealloc(const struct bpf_htab *htab)
{
return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
@@ -457,6 +462,83 @@ static int htab_map_alloc_check(union bpf_attr *attr)
return 0;
}
+static void htab_mem_dtor(void *obj, void *ctx)
+{
+ struct htab_btf_record *hrec = ctx;
+ struct htab_elem *elem = obj;
+ void *map_value;
+
+ if (IS_ERR_OR_NULL(hrec->record))
+ return;
+
+ map_value = htab_elem_value(elem, hrec->key_size);
+ bpf_obj_free_fields(hrec->record, map_value);
+}
+
+static void htab_pcpu_mem_dtor(void *obj, void *ctx)
+{
+ void __percpu *pptr = *(void __percpu **)obj;
+ struct htab_btf_record *hrec = ctx;
+ int cpu;
+
+ if (IS_ERR_OR_NULL(hrec->record))
+ return;
+
+ for_each_possible_cpu(cpu)
+ bpf_obj_free_fields(hrec->record, per_cpu_ptr(pptr, cpu));
+}
+
+static void htab_dtor_ctx_free(void *ctx)
+{
+ struct htab_btf_record *hrec = ctx;
+
+ btf_record_free(hrec->record);
+ kfree(ctx);
+}
+
+static int htab_set_dtor(struct bpf_htab *htab, void (*dtor)(void *, void *))
+{
+ u32 key_size = htab->map.key_size;
+ struct bpf_mem_alloc *ma;
+ struct htab_btf_record *hrec;
+ int err;
+
+ /* No need for dtors. */
+ if (IS_ERR_OR_NULL(htab->map.record))
+ return 0;
+
+ hrec = kzalloc(sizeof(*hrec), GFP_KERNEL);
+ if (!hrec)
+ return -ENOMEM;
+ hrec->key_size = key_size;
+ hrec->record = btf_record_dup(htab->map.record);
+ if (IS_ERR(hrec->record)) {
+ err = PTR_ERR(hrec->record);
+ kfree(hrec);
+ return err;
+ }
+ ma = htab_is_percpu(htab) ? &htab->pcpu_ma : &htab->ma;
+ bpf_mem_alloc_set_dtor(ma, dtor, htab_dtor_ctx_free, hrec);
+ return 0;
+}
+
+static int htab_map_check_btf(struct bpf_map *map, const struct btf *btf,
+ const struct btf_type *key_type, const struct btf_type *value_type)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+ if (htab_is_prealloc(htab))
+ return 0;
+ /*
+ * We must set the dtor using this callback, as map's BTF record is not
+ * populated in htab_map_alloc(), so it will always appear as NULL.
+ */
+ if (htab_is_percpu(htab))
+ return htab_set_dtor(htab, htab_pcpu_mem_dtor);
+ else
+ return htab_set_dtor(htab, htab_mem_dtor);
+}
+
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -974,7 +1056,7 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
for_each_possible_cpu(cpu) {
if (cpu == current_cpu)
- copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value);
+ copy_map_value(&htab->map, per_cpu_ptr(pptr, cpu), value);
else /* Since elem is preallocated, we cannot touch special fields */
zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
}
@@ -1056,6 +1138,10 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
} else if (fd_htab_map_needs_adjust(htab)) {
size = round_up(size, 8);
memcpy(htab_elem_value(l_new, key_size), value, size);
+ } else if (map_flags & BPF_F_LOCK) {
+ copy_map_value_locked(&htab->map,
+ htab_elem_value(l_new, key_size),
+ value, false);
} else {
copy_map_value(&htab->map, htab_elem_value(l_new, key_size), value);
}
@@ -2281,6 +2367,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_seq_show_elem = htab_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_check_btf = htab_map_check_btf,
.map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab),
.map_btf_id = &htab_map_btf_ids[0],
@@ -2303,6 +2390,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_seq_show_elem = htab_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_check_btf = htab_map_check_btf,
.map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_lru),
.map_btf_id = &htab_map_btf_ids[0],
@@ -2482,6 +2570,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_check_btf = htab_map_check_btf,
.map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_percpu),
.map_btf_id = &htab_map_btf_ids[0],
@@ -2502,6 +2591,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_check_btf = htab_map_check_btf,
.map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_lru_percpu),
.map_btf_id = &htab_map_btf_ids[0],
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 6eb6c82ed2ee..2bb60200c266 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -845,7 +845,13 @@ int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args,
data->buf = buffers->buf;
for (i = 0; i < fmt_size; i++) {
- if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
+ unsigned char c = fmt[i];
+
+ /*
+ * Permit bytes >= 0x80 in plain text so UTF-8 literals can pass
+ * through unchanged, while still rejecting ASCII control bytes.
+ */
+ if (isascii(c) && !isprint(c) && !isspace(c)) {
err = -EINVAL;
goto out;
}
@@ -867,6 +873,15 @@ int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args,
* always access fmt[i + 1], in the worst case it will be a 0
*/
i++;
+ c = fmt[i];
+ /*
+ * The format parser below only understands ASCII conversion
+ * specifiers and modifiers, so reject non-ASCII after '%'.
+ */
+ if (!isascii(c)) {
+ err = -EINVAL;
+ goto out;
+ }
/* skip optional "[0 +-][num]" width formatting field */
while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' ||
@@ -1272,7 +1287,7 @@ static void bpf_async_cb_rcu_tasks_trace_free(struct rcu_head *rcu)
return;
}
- /* rcu_trace_implies_rcu_gp() is true and will remain so */
+ /* RCU Tasks Trace grace period implies RCU grace period. */
bpf_async_cb_rcu_free(rcu);
}
@@ -2302,9 +2317,20 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
__bpf_kfunc_start_defs();
-__bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+/**
+ * bpf_obj_new() - allocate an object described by program BTF
+ * @local_type_id__k: type ID in program BTF
+ * @meta: verifier-supplied struct metadata
+ *
+ * Allocate an object of the type identified by @local_type_id__k and
+ * initialize its special fields. BPF programs can use
+ * bpf_core_type_id_local() to provide @local_type_id__k. The verifier
+ * rewrites @meta; BPF programs do not set it.
+ *
+ * Return: Pointer to the allocated object, or %NULL on failure.
+ */
+__bpf_kfunc void *bpf_obj_new(u64 local_type_id__k, struct btf_struct_meta *meta)
{
- struct btf_struct_meta *meta = meta__ign;
u64 size = local_type_id__k;
void *p;
@@ -2313,17 +2339,39 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
return NULL;
if (meta)
bpf_obj_init(meta->record, p);
+
return p;
}
-__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+__bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+{
+ return bpf_obj_new(local_type_id__k, meta__ign);
+}
+
+/**
+ * bpf_percpu_obj_new() - allocate a percpu object described by program BTF
+ * @local_type_id__k: type ID in program BTF
+ * @meta: verifier-supplied struct metadata
+ *
+ * Allocate a percpu object of the type identified by @local_type_id__k. BPF
+ * programs can use bpf_core_type_id_local() to provide @local_type_id__k.
+ * The verifier rewrites @meta; BPF programs do not set it.
+ *
+ * Return: Pointer to the allocated percpu object, or %NULL on failure.
+ */
+__bpf_kfunc void *bpf_percpu_obj_new(u64 local_type_id__k, struct btf_struct_meta *meta)
{
u64 size = local_type_id__k;
- /* The verifier has ensured that meta__ign must be NULL */
+ /* The verifier has ensured that meta must be NULL */
return bpf_mem_alloc(&bpf_global_percpu_ma, size);
}
+__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+{
+ return bpf_percpu_obj_new(local_type_id__k, meta__ign);
+}
+
/* Must be called under migrate_disable(), as required by bpf_mem_free */
void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu)
{
@@ -2347,23 +2395,56 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu)
bpf_mem_free_rcu(ma, p);
}
-__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
+/**
+ * bpf_obj_drop() - drop a previously allocated object
+ * @p__alloc: object to free
+ * @meta: verifier-supplied struct metadata
+ *
+ * Destroy special fields in @p__alloc as needed and free the object. The
+ * verifier rewrites @meta; BPF programs do not set it.
+ */
+__bpf_kfunc void bpf_obj_drop(void *p__alloc, struct btf_struct_meta *meta)
{
- struct btf_struct_meta *meta = meta__ign;
void *p = p__alloc;
__bpf_obj_drop_impl(p, meta ? meta->record : NULL, false);
}
-__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
+__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
+{
+	bpf_obj_drop(p__alloc, meta__ign);
+}
+
+/**
+ * bpf_percpu_obj_drop() - drop a previously allocated percpu object
+ * @p__alloc: percpu object to free
+ * @meta: verifier-supplied struct metadata
+ *
+ * Free @p__alloc. The verifier rewrites @meta; BPF programs do not set it.
+ */
+__bpf_kfunc void bpf_percpu_obj_drop(void *p__alloc, struct btf_struct_meta *meta)
{
- /* The verifier has ensured that meta__ign must be NULL */
+ /* The verifier has ensured that meta must be NULL */
bpf_mem_free_rcu(&bpf_global_percpu_ma, p__alloc);
}
-__bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign)
+__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
+{
+ bpf_percpu_obj_drop(p__alloc, meta__ign);
+}
+
+/**
+ * bpf_refcount_acquire() - turn a local kptr into an owning reference
+ * @p__refcounted_kptr: non-owning local kptr
+ * @meta: verifier-supplied struct metadata
+ *
+ * Increment the refcount for @p__refcounted_kptr. The verifier rewrites
+ * @meta; BPF programs do not set it.
+ *
+ * Return: Owning reference to @p__refcounted_kptr, or %NULL on failure.
+ */
+__bpf_kfunc void *bpf_refcount_acquire(void *p__refcounted_kptr, struct btf_struct_meta *meta)
{
- struct btf_struct_meta *meta = meta__ign;
struct bpf_refcount *ref;
/* Could just cast directly to refcount_t *, but need some code using
@@ -2379,6 +2460,11 @@ __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta
return (void *)p__refcounted_kptr;
}
+__bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign)
+{
+ return bpf_refcount_acquire(p__refcounted_kptr, meta__ign);
+}
+
static int __bpf_list_add(struct bpf_list_node_kern *node,
struct bpf_list_head *head,
bool tail, struct btf_record *rec, u64 off)
@@ -2406,24 +2492,62 @@ static int __bpf_list_add(struct bpf_list_node_kern *node,
return 0;
}
+/**
+ * bpf_list_push_front() - add a node to the front of a BPF linked list
+ * @head: list head
+ * @node: node to insert
+ * @meta: verifier-supplied struct metadata
+ * @off: verifier-supplied offset of @node within the containing object
+ *
+ * Insert @node at the front of @head. The verifier rewrites @meta and @off;
+ * BPF programs do not set them.
+ *
+ * Return: 0 on success, or %-EINVAL if @node is already linked.
+ */
+__bpf_kfunc int bpf_list_push_front(struct bpf_list_head *head,
+ struct bpf_list_node *node,
+ struct btf_struct_meta *meta,
+ u64 off)
+{
+ struct bpf_list_node_kern *n = (void *)node;
+
+ return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off);
+}
+
__bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head,
struct bpf_list_node *node,
void *meta__ign, u64 off)
{
+ return bpf_list_push_front(head, node, meta__ign, off);
+}
+
+/**
+ * bpf_list_push_back() - add a node to the back of a BPF linked list
+ * @head: list head
+ * @node: node to insert
+ * @meta: verifier-supplied struct metadata
+ * @off: verifier-supplied offset of @node within the containing object
+ *
+ * Insert @node at the back of @head. The verifier rewrites @meta and @off;
+ * BPF programs do not set them.
+ *
+ * Return: 0 on success, or %-EINVAL if @node is already linked.
+ */
+__bpf_kfunc int bpf_list_push_back(struct bpf_list_head *head,
+ struct bpf_list_node *node,
+ struct btf_struct_meta *meta,
+ u64 off)
+{
struct bpf_list_node_kern *n = (void *)node;
- struct btf_struct_meta *meta = meta__ign;
- return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off);
+ return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off);
}
__bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
struct bpf_list_node *node,
void *meta__ign, u64 off)
{
- struct bpf_list_node_kern *n = (void *)node;
- struct btf_struct_meta *meta = meta__ign;
-
- return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off);
+ return bpf_list_push_back(head, node, meta__ign, off);
}
static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
@@ -2535,16 +2659,37 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root,
return 0;
}
-__bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
- bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
- void *meta__ign, u64 off)
+/**
+ * bpf_rbtree_add() - add a node to a BPF rbtree
+ * @root: tree root
+ * @node: node to insert
+ * @less: comparator used to order nodes
+ * @meta: verifier-supplied struct metadata
+ * @off: verifier-supplied offset of @node within the containing object
+ *
+ * Insert @node into @root using @less. The verifier rewrites @meta and @off;
+ * BPF programs do not set them.
+ *
+ * Return: 0 on success, or %-EINVAL if @node is already linked in a tree.
+ */
+__bpf_kfunc int bpf_rbtree_add(struct bpf_rb_root *root,
+ struct bpf_rb_node *node,
+ bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
+ struct btf_struct_meta *meta,
+ u64 off)
{
- struct btf_struct_meta *meta = meta__ign;
struct bpf_rb_node_kern *n = (void *)node;
return __bpf_rbtree_add(root, n, (void *)less, meta ? meta->record : NULL, off);
}
+__bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
+ bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
+ void *meta__ign, u64 off)
+{
+ return bpf_rbtree_add(root, node, less, meta__ign, off);
+}
+
__bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
{
struct rb_root_cached *r = (struct rb_root_cached *)root;
@@ -4165,17 +4310,25 @@ static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx)
return refcount_inc_not_zero(&ctx->refcnt);
}
+static void bpf_task_work_destroy(struct irq_work *irq_work)
+{
+ struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
+
+ bpf_task_work_ctx_reset(ctx);
+ kfree_rcu(ctx, rcu);
+}
+
static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx)
{
if (!refcount_dec_and_test(&ctx->refcnt))
return;
- bpf_task_work_ctx_reset(ctx);
-
- /* bpf_mem_free expects migration to be disabled */
- migrate_disable();
- bpf_mem_free(&bpf_global_ma, ctx);
- migrate_enable();
+ if (irqs_disabled()) {
+ ctx->irq_work = IRQ_WORK_INIT(bpf_task_work_destroy);
+ irq_work_queue(&ctx->irq_work);
+ } else {
+ bpf_task_work_destroy(&ctx->irq_work);
+ }
}
static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx)
@@ -4229,7 +4382,7 @@ static void bpf_task_work_irq(struct irq_work *irq_work)
enum bpf_task_work_state state;
int err;
- guard(rcu_tasks_trace)();
+ guard(rcu)();
if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) {
bpf_task_work_ctx_put(ctx);
@@ -4251,9 +4404,9 @@ static void bpf_task_work_irq(struct irq_work *irq_work)
/*
* It's technically possible for just scheduled task_work callback to
* complete running by now, going SCHEDULING -> RUNNING and then
- * dropping its ctx refcount. Instead of capturing extra ref just to
- * protected below ctx->state access, we rely on RCU protection to
- * perform below SCHEDULING -> SCHEDULED attempt.
+ * dropping its ctx refcount. Instead of capturing an extra ref just
+ * to protect below ctx->state access, we rely on rcu_read_lock
+ * above to prevent kfree_rcu from freeing ctx before we return.
*/
state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED);
if (state == BPF_TW_FREED)
@@ -4270,7 +4423,7 @@ static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *t
if (ctx)
return ctx;
- ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_ctx));
+ ctx = bpf_map_kmalloc_nolock(map, sizeof(*ctx), 0, NUMA_NO_NODE);
if (!ctx)
return ERR_PTR(-ENOMEM);
@@ -4284,7 +4437,7 @@ static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *t
* tw->ctx is set by concurrent BPF program, release allocated
* memory and try to reuse already set context.
*/
- bpf_mem_free(&bpf_global_ma, ctx);
+ kfree_nolock(ctx);
return old_ctx;
}
@@ -4296,13 +4449,23 @@ static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work
{
struct bpf_task_work_ctx *ctx;
- ctx = bpf_task_work_fetch_ctx(tw, map);
- if (IS_ERR(ctx))
- return ctx;
-
- /* try to get ref for task_work callback to hold */
- if (!bpf_task_work_ctx_tryget(ctx))
- return ERR_PTR(-EBUSY);
+ /*
+ * Sleepable BPF programs hold rcu_read_lock_trace but not
+ * regular rcu_read_lock. Since kfree_rcu waits for regular
+ * RCU GP, the ctx can be freed while we're between reading
+ * the pointer and incrementing the refcount. Take regular
+ * rcu_read_lock to prevent kfree_rcu from freeing the ctx
+ * before we can tryget it.
+ */
+ scoped_guard(rcu) {
+ ctx = bpf_task_work_fetch_ctx(tw, map);
+ if (IS_ERR(ctx))
+ return ctx;
+
+ /* try to get ref for task_work callback to hold */
+ if (!bpf_task_work_ctx_tryget(ctx))
+ return ERR_PTR(-EBUSY);
+ }
if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) {
/* lost acquiring race or map_release_uref() stole it from us, put ref and bail */
@@ -4417,7 +4580,7 @@ static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep,
return -EINVAL;
}
- state = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_dynptr_file_impl));
+ state = kmalloc_nolock(sizeof(*state), 0, NUMA_NO_NODE);
if (!state) {
bpf_dynptr_set_null(ptr);
return -ENOMEM;
@@ -4449,7 +4612,7 @@ __bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr)
return 0;
freader_cleanup(&df->freader);
- bpf_mem_free(&bpf_global_ma, df);
+ kfree_nolock(df);
bpf_dynptr_set_null(ptr);
return 0;
}
@@ -4536,12 +4699,19 @@ BTF_KFUNCS_START(generic_btf_ids)
#ifdef CONFIG_CRASH_DUMP
BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
#endif
+BTF_ID_FLAGS(func, bpf_obj_new, KF_ACQUIRE | KF_RET_NULL | KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_percpu_obj_new, KF_ACQUIRE | KF_RET_NULL | KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_obj_drop, KF_RELEASE | KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_percpu_obj_drop, KF_RELEASE | KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_refcount_acquire, KF_ACQUIRE | KF_RET_NULL | KF_RCU | KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU)
+BTF_ID_FLAGS(func, bpf_list_push_front, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_list_push_front_impl)
+BTF_ID_FLAGS(func, bpf_list_push_back, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_list_push_back_impl)
BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
@@ -4550,6 +4720,7 @@ BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_add, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_rbtree_add_impl)
BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL)
@@ -4578,6 +4749,9 @@ BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
#endif
#endif
+#ifdef CONFIG_S390
+BTF_ID_FLAGS(func, bpf_get_lowcore)
+#endif
BTF_KFUNCS_END(generic_btf_ids)
static const struct btf_kfunc_id_set generic_kfunc_set = {
diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c
index 998986853c61..332e6e003f27 100644
--- a/kernel/bpf/liveness.c
+++ b/kernel/bpf/liveness.c
@@ -2,217 +2,119 @@
/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
#include <linux/bpf_verifier.h>
+#include <linux/btf.h>
#include <linux/hashtable.h>
#include <linux/jhash.h>
#include <linux/slab.h>
+#include <linux/sort.h>
-/*
- * This file implements live stack slots analysis. After accumulating
- * stack usage data, the analysis answers queries about whether a
- * particular stack slot may be read by an instruction or any of it's
- * successors. This data is consumed by the verifier states caching
- * mechanism to decide which stack slots are important when looking for a
- * visited state corresponding to the current state.
- *
- * The analysis is call chain sensitive, meaning that data is collected
- * and queried for tuples (call chain, subprogram instruction index).
- * Such sensitivity allows identifying if some subprogram call always
- * leads to writes in the caller's stack.
- *
- * The basic idea is as follows:
- * - As the verifier accumulates a set of visited states, the analysis instance
- * accumulates a conservative estimate of stack slots that can be read
- * or must be written for each visited tuple (call chain, instruction index).
- * - If several states happen to visit the same instruction with the same
- * call chain, stack usage information for the corresponding tuple is joined:
- * - "may_read" set represents a union of all possibly read slots
- * (any slot in "may_read" set might be read at or after the instruction);
- * - "must_write" set represents an intersection of all possibly written slots
- * (any slot in "must_write" set is guaranteed to be written by the instruction).
- * - The analysis is split into two phases:
- * - read and write marks accumulation;
- * - read and write marks propagation.
- * - The propagation phase is a textbook live variable data flow analysis:
- *
- * state[cc, i].live_after = U [state[cc, s].live_before for s in bpf_insn_successors(i)]
- * state[cc, i].live_before =
- * (state[cc, i].live_after / state[cc, i].must_write) U state[i].may_read
- *
- * Where:
- * - `U` stands for set union
- * - `/` stands for set difference;
- * - `cc` stands for a call chain;
- * - `i` and `s` are instruction indexes;
- *
- * The above equations are computed for each call chain and instruction
- * index until state stops changing.
- * - Additionally, in order to transfer "must_write" information from a
- * subprogram to call instructions invoking this subprogram,
- * the "must_write_acc" set is tracked for each (cc, i) tuple.
- * A set of stack slots that are guaranteed to be written by this
- * instruction or any of its successors (within the subprogram).
- * The equation for "must_write_acc" propagation looks as follows:
- *
- * state[cc, i].must_write_acc =
- * ∩ [state[cc, s].must_write_acc for s in bpf_insn_successors(i)]
- * U state[cc, i].must_write
- *
- * (An intersection of all "must_write_acc" for instruction successors
- * plus all "must_write" slots for the instruction itself).
- * - After the propagation phase completes for a subprogram, information from
- * (cc, 0) tuple (subprogram entry) is transferred to the caller's call chain:
- * - "must_write_acc" set is intersected with the call site's "must_write" set;
- * - "may_read" set is added to the call site's "may_read" set.
- * - Any live stack queries must be taken after the propagation phase.
- * - Accumulation and propagation phases can be entered multiple times,
- * at any point in time:
- * - "may_read" set only grows;
- * - "must_write" set only shrinks;
- * - for each visited verifier state with zero branches, all relevant
- * read and write marks are already recorded by the analysis instance.
- *
- * Technically, the analysis is facilitated by the following data structures:
- * - Call chain: for given verifier state, the call chain is a tuple of call
- * instruction indexes leading to the current subprogram plus the subprogram
- * entry point index.
- * - Function instance: for a given call chain, for each instruction in
- * the current subprogram, a mapping between instruction index and a
- * set of "may_read", "must_write" and other marks accumulated for this
- * instruction.
- * - A hash table mapping call chains to function instances.
- */
-
-struct callchain {
- u32 callsites[MAX_CALL_FRAMES]; /* instruction pointer for each frame */
- /* cached subprog_info[*].start for functions owning the frames:
- * - sp_starts[curframe] used to get insn relative index within current function;
- * - sp_starts[0..current-1] used for fast callchain_frame_up().
- */
- u32 sp_starts[MAX_CALL_FRAMES];
- u32 curframe; /* depth of callsites and sp_starts arrays */
-};
+#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
struct per_frame_masks {
- u64 may_read; /* stack slots that may be read by this instruction */
- u64 must_write; /* stack slots written by this instruction */
- u64 must_write_acc; /* stack slots written by this instruction and its successors */
- u64 live_before; /* stack slots that may be read by this insn and its successors */
+ spis_t may_read; /* stack slots that may be read by this instruction */
+ spis_t must_write; /* stack slots written by this instruction */
+ spis_t live_before; /* stack slots that may be read by this insn and its successors */
};
/*
- * A function instance created for a specific callchain.
+ * A function instance keyed by (callsite, depth).
* Encapsulates read and write marks for each instruction in the function.
- * Marks are tracked for each frame in the callchain.
+ * Marks are tracked for each frame up to @depth.
*/
struct func_instance {
struct hlist_node hl_node;
- struct callchain callchain;
+ u32 callsite; /* call insn that invoked this subprog (subprog_start for depth 0) */
+ u32 depth; /* call depth (0 = entry subprog) */
+ u32 subprog; /* subprog index */
+ u32 subprog_start; /* cached env->subprog_info[subprog].start */
u32 insn_cnt; /* cached number of insns in the function */
- bool updated;
- bool must_write_dropped;
/* Per frame, per instruction masks, frames allocated lazily. */
struct per_frame_masks *frames[MAX_CALL_FRAMES];
- /* For each instruction a flag telling if "must_write" had been initialized for it. */
- bool *must_write_set;
+ bool must_write_initialized;
};
struct live_stack_query {
struct func_instance *instances[MAX_CALL_FRAMES]; /* valid in range [0..curframe] */
+	u32 callsites[MAX_CALL_FRAMES]; /* callsites[i] = insn calling frame i+1 */
u32 curframe;
u32 insn_idx;
};
struct bpf_liveness {
- DECLARE_HASHTABLE(func_instances, 8); /* maps callchain to func_instance */
+ DECLARE_HASHTABLE(func_instances, 8); /* maps (depth, callsite) to func_instance */
struct live_stack_query live_stack_query; /* cache to avoid repetitive ht lookups */
- /* Cached instance corresponding to env->cur_state, avoids per-instruction ht lookup */
- struct func_instance *cur_instance;
- /*
- * Below fields are used to accumulate stack write marks for instruction at
- * @write_insn_idx before submitting the marks to @cur_instance.
- */
- u64 write_masks_acc[MAX_CALL_FRAMES];
- u32 write_insn_idx;
+ u32 subprog_calls; /* analyze_subprog() invocations */
};
-/* Compute callchain corresponding to state @st at depth @frameno */
-static void compute_callchain(struct bpf_verifier_env *env, struct bpf_verifier_state *st,
- struct callchain *callchain, u32 frameno)
+/*
+ * Hash/compare key for func_instance: (depth, callsite).
+ * For depth == 0 (entry subprog), @callsite is the subprog start insn.
+ * For depth > 0, @callsite is the call instruction index that invoked the subprog.
+ */
+static u32 instance_hash(u32 callsite, u32 depth)
{
- struct bpf_subprog_info *subprog_info = env->subprog_info;
- u32 i;
+ u32 key[2] = { depth, callsite };
- memset(callchain, 0, sizeof(*callchain));
- for (i = 0; i <= frameno; i++) {
- callchain->sp_starts[i] = subprog_info[st->frame[i]->subprogno].start;
- if (i < st->curframe)
- callchain->callsites[i] = st->frame[i + 1]->callsite;
- }
- callchain->curframe = frameno;
- callchain->callsites[callchain->curframe] = callchain->sp_starts[callchain->curframe];
+ return jhash2(key, 2, 0);
}
-static u32 hash_callchain(struct callchain *callchain)
+static struct func_instance *find_instance(struct bpf_verifier_env *env,
+ u32 callsite, u32 depth)
{
- return jhash2(callchain->callsites, callchain->curframe, 0);
-}
-
-static bool same_callsites(struct callchain *a, struct callchain *b)
-{
- int i;
+ struct bpf_liveness *liveness = env->liveness;
+ struct func_instance *f;
+ u32 key = instance_hash(callsite, depth);
- if (a->curframe != b->curframe)
- return false;
- for (i = a->curframe; i >= 0; i--)
- if (a->callsites[i] != b->callsites[i])
- return false;
- return true;
+ hash_for_each_possible(liveness->func_instances, f, hl_node, key)
+ if (f->depth == depth && f->callsite == callsite)
+ return f;
+ return NULL;
}
-/*
- * Find existing or allocate new function instance corresponding to @callchain.
- * Instances are accumulated in env->liveness->func_instances and persist
- * until the end of the verification process.
- */
-static struct func_instance *__lookup_instance(struct bpf_verifier_env *env,
- struct callchain *callchain)
+static struct func_instance *call_instance(struct bpf_verifier_env *env,
+ struct func_instance *caller,
+ u32 callsite, int subprog)
{
- struct bpf_liveness *liveness = env->liveness;
- struct bpf_subprog_info *subprog;
- struct func_instance *result;
- u32 subprog_sz, size, key;
-
- key = hash_callchain(callchain);
- hash_for_each_possible(liveness->func_instances, result, hl_node, key)
- if (same_callsites(&result->callchain, callchain))
- return result;
-
- subprog = bpf_find_containing_subprog(env, callchain->sp_starts[callchain->curframe]);
- subprog_sz = (subprog + 1)->start - subprog->start;
- size = sizeof(struct func_instance);
- result = kvzalloc(size, GFP_KERNEL_ACCOUNT);
- if (!result)
- return ERR_PTR(-ENOMEM);
- result->must_write_set = kvzalloc_objs(*result->must_write_set,
- subprog_sz, GFP_KERNEL_ACCOUNT);
- if (!result->must_write_set) {
- kvfree(result);
+ u32 depth = caller ? caller->depth + 1 : 0;
+ u32 subprog_start = env->subprog_info[subprog].start;
+ u32 lookup_key = depth > 0 ? callsite : subprog_start;
+ struct func_instance *f;
+ u32 hash;
+
+ f = find_instance(env, lookup_key, depth);
+ if (f)
+ return f;
+
+ f = kvzalloc(sizeof(*f), GFP_KERNEL_ACCOUNT);
+ if (!f)
return ERR_PTR(-ENOMEM);
- }
- memcpy(&result->callchain, callchain, sizeof(*callchain));
- result->insn_cnt = subprog_sz;
- hash_add(liveness->func_instances, &result->hl_node, key);
- return result;
+ f->callsite = lookup_key;
+ f->depth = depth;
+ f->subprog = subprog;
+ f->subprog_start = subprog_start;
+ f->insn_cnt = (env->subprog_info + subprog + 1)->start - subprog_start;
+ hash = instance_hash(lookup_key, depth);
+ hash_add(env->liveness->func_instances, &f->hl_node, hash);
+ return f;
}
static struct func_instance *lookup_instance(struct bpf_verifier_env *env,
struct bpf_verifier_state *st,
u32 frameno)
{
- struct callchain callchain;
-
- compute_callchain(env, st, &callchain, frameno);
- return __lookup_instance(env, &callchain);
+ u32 callsite, subprog_start;
+ struct func_instance *f;
+ u32 key, depth;
+
+ subprog_start = env->subprog_info[st->frame[frameno]->subprogno].start;
+ callsite = frameno > 0 ? st->frame[frameno]->callsite : subprog_start;
+
+ for (depth = frameno; ; depth--) {
+ key = depth > 0 ? callsite : subprog_start;
+ f = find_instance(env, key, depth);
+ if (f || depth == 0)
+ return f;
+ }
}
int bpf_stack_liveness_init(struct bpf_verifier_env *env)
@@ -233,9 +135,8 @@ void bpf_stack_liveness_free(struct bpf_verifier_env *env)
if (!env->liveness)
return;
hash_for_each_safe(env->liveness->func_instances, bkt, tmp, instance, hl_node) {
- for (i = 0; i <= instance->callchain.curframe; i++)
+ for (i = 0; i <= instance->depth; i++)
kvfree(instance->frames[i]);
- kvfree(instance->must_write_set);
kvfree(instance);
}
kvfree(env->liveness);
@@ -247,7 +148,7 @@ void bpf_stack_liveness_free(struct bpf_verifier_env *env)
*/
static int relative_idx(struct func_instance *instance, u32 insn_idx)
{
- return insn_idx - instance->callchain.sp_starts[instance->callchain.curframe];
+ return insn_idx - instance->subprog_start;
}
static struct per_frame_masks *get_frame_masks(struct func_instance *instance,
@@ -259,8 +160,7 @@ static struct per_frame_masks *get_frame_masks(struct func_instance *instance,
return &instance->frames[frame][relative_idx(instance, insn_idx)];
}
-static struct per_frame_masks *alloc_frame_masks(struct bpf_verifier_env *env,
- struct func_instance *instance,
+static struct per_frame_masks *alloc_frame_masks(struct func_instance *instance,
u32 frame, u32 insn_idx)
{
struct per_frame_masks *arr;
@@ -275,167 +175,29 @@ static struct per_frame_masks *alloc_frame_masks(struct bpf_verifier_env *env,
return get_frame_masks(instance, frame, insn_idx);
}
-void bpf_reset_live_stack_callchain(struct bpf_verifier_env *env)
-{
- env->liveness->cur_instance = NULL;
-}
-
-/* If @env->liveness->cur_instance is null, set it to instance corresponding to @env->cur_state. */
-static int ensure_cur_instance(struct bpf_verifier_env *env)
-{
- struct bpf_liveness *liveness = env->liveness;
- struct func_instance *instance;
-
- if (liveness->cur_instance)
- return 0;
-
- instance = lookup_instance(env, env->cur_state, env->cur_state->curframe);
- if (IS_ERR(instance))
- return PTR_ERR(instance);
-
- liveness->cur_instance = instance;
- return 0;
-}
-
/* Accumulate may_read masks for @frame at @insn_idx */
-static int mark_stack_read(struct bpf_verifier_env *env,
- struct func_instance *instance, u32 frame, u32 insn_idx, u64 mask)
+static int mark_stack_read(struct func_instance *instance, u32 frame, u32 insn_idx, spis_t mask)
{
struct per_frame_masks *masks;
- u64 new_may_read;
- masks = alloc_frame_masks(env, instance, frame, insn_idx);
+ masks = alloc_frame_masks(instance, frame, insn_idx);
if (IS_ERR(masks))
return PTR_ERR(masks);
- new_may_read = masks->may_read | mask;
- if (new_may_read != masks->may_read &&
- ((new_may_read | masks->live_before) != masks->live_before))
- instance->updated = true;
- masks->may_read |= mask;
- return 0;
-}
-
-int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frame, u32 insn_idx, u64 mask)
-{
- int err;
-
- err = ensure_cur_instance(env);
- err = err ?: mark_stack_read(env, env->liveness->cur_instance, frame, insn_idx, mask);
- return err;
-}
-
-static void reset_stack_write_marks(struct bpf_verifier_env *env,
- struct func_instance *instance, u32 insn_idx)
-{
- struct bpf_liveness *liveness = env->liveness;
- int i;
-
- liveness->write_insn_idx = insn_idx;
- for (i = 0; i <= instance->callchain.curframe; i++)
- liveness->write_masks_acc[i] = 0;
-}
-
-int bpf_reset_stack_write_marks(struct bpf_verifier_env *env, u32 insn_idx)
-{
- struct bpf_liveness *liveness = env->liveness;
- int err;
-
- err = ensure_cur_instance(env);
- if (err)
- return err;
-
- reset_stack_write_marks(env, liveness->cur_instance, insn_idx);
+ masks->may_read = spis_or(masks->may_read, mask);
return 0;
}
-void bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frame, u64 mask)
-{
- env->liveness->write_masks_acc[frame] |= mask;
-}
-
-static int commit_stack_write_marks(struct bpf_verifier_env *env,
- struct func_instance *instance)
+static int mark_stack_write(struct func_instance *instance, u32 frame, u32 insn_idx, spis_t mask)
{
- struct bpf_liveness *liveness = env->liveness;
- u32 idx, frame, curframe, old_must_write;
struct per_frame_masks *masks;
- u64 mask;
-
- if (!instance)
- return 0;
- curframe = instance->callchain.curframe;
- idx = relative_idx(instance, liveness->write_insn_idx);
- for (frame = 0; frame <= curframe; frame++) {
- mask = liveness->write_masks_acc[frame];
- /* avoid allocating frames for zero masks */
- if (mask == 0 && !instance->must_write_set[idx])
- continue;
- masks = alloc_frame_masks(env, instance, frame, liveness->write_insn_idx);
- if (IS_ERR(masks))
- return PTR_ERR(masks);
- old_must_write = masks->must_write;
- /*
- * If instruction at this callchain is seen for a first time, set must_write equal
- * to @mask. Otherwise take intersection with the previous value.
- */
- if (instance->must_write_set[idx])
- mask &= old_must_write;
- if (old_must_write != mask) {
- masks->must_write = mask;
- instance->updated = true;
- }
- if (old_must_write & ~mask)
- instance->must_write_dropped = true;
- }
- instance->must_write_set[idx] = true;
- liveness->write_insn_idx = 0;
+ masks = alloc_frame_masks(instance, frame, insn_idx);
+ if (IS_ERR(masks))
+ return PTR_ERR(masks);
+ masks->must_write = spis_or(masks->must_write, mask);
return 0;
}
-/*
- * Merge stack writes marks in @env->liveness->write_masks_acc
- * with information already in @env->liveness->cur_instance.
- */
-int bpf_commit_stack_write_marks(struct bpf_verifier_env *env)
-{
- return commit_stack_write_marks(env, env->liveness->cur_instance);
-}
-
-static char *fmt_callchain(struct bpf_verifier_env *env, struct callchain *callchain)
-{
- char *buf_end = env->tmp_str_buf + sizeof(env->tmp_str_buf);
- char *buf = env->tmp_str_buf;
- int i;
-
- buf += snprintf(buf, buf_end - buf, "(");
- for (i = 0; i <= callchain->curframe; i++)
- buf += snprintf(buf, buf_end - buf, "%s%d", i ? "," : "", callchain->callsites[i]);
- snprintf(buf, buf_end - buf, ")");
- return env->tmp_str_buf;
-}
-
-static void log_mask_change(struct bpf_verifier_env *env, struct callchain *callchain,
- char *pfx, u32 frame, u32 insn_idx, u64 old, u64 new)
-{
- u64 changed_bits = old ^ new;
- u64 new_ones = new & changed_bits;
- u64 new_zeros = ~new & changed_bits;
-
- if (!changed_bits)
- return;
- bpf_log(&env->log, "%s frame %d insn %d ", fmt_callchain(env, callchain), frame, insn_idx);
- if (new_ones) {
- bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_ones);
- bpf_log(&env->log, "+%s %s ", pfx, env->tmp_str_buf);
- }
- if (new_zeros) {
- bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_zeros);
- bpf_log(&env->log, "-%s %s", pfx, env->tmp_str_buf);
- }
- bpf_log(&env->log, "\n");
-}
-
int bpf_jmp_offset(struct bpf_insn *insn)
{
u8 code = insn->code;
@@ -507,62 +269,11 @@ bpf_insn_successors(struct bpf_verifier_env *env, u32 idx)
__diag_pop();
-static struct func_instance *get_outer_instance(struct bpf_verifier_env *env,
- struct func_instance *instance)
-{
- struct callchain callchain = instance->callchain;
-
- /* Adjust @callchain to represent callchain one frame up */
- callchain.callsites[callchain.curframe] = 0;
- callchain.sp_starts[callchain.curframe] = 0;
- callchain.curframe--;
- callchain.callsites[callchain.curframe] = callchain.sp_starts[callchain.curframe];
- return __lookup_instance(env, &callchain);
-}
-
-static u32 callchain_subprog_start(struct callchain *callchain)
-{
- return callchain->sp_starts[callchain->curframe];
-}
-
-/*
- * Transfer @may_read and @must_write_acc marks from the first instruction of @instance,
- * to the call instruction in function instance calling @instance.
- */
-static int propagate_to_outer_instance(struct bpf_verifier_env *env,
- struct func_instance *instance)
-{
- struct callchain *callchain = &instance->callchain;
- u32 this_subprog_start, callsite, frame;
- struct func_instance *outer_instance;
- struct per_frame_masks *insn;
- int err;
-
- this_subprog_start = callchain_subprog_start(callchain);
- outer_instance = get_outer_instance(env, instance);
- if (IS_ERR(outer_instance))
- return PTR_ERR(outer_instance);
- callsite = callchain->callsites[callchain->curframe - 1];
-
- reset_stack_write_marks(env, outer_instance, callsite);
- for (frame = 0; frame < callchain->curframe; frame++) {
- insn = get_frame_masks(instance, frame, this_subprog_start);
- if (!insn)
- continue;
- bpf_mark_stack_write(env, frame, insn->must_write_acc);
- err = mark_stack_read(env, outer_instance, frame, callsite, insn->live_before);
- if (err)
- return err;
- }
- commit_stack_write_marks(env, outer_instance);
- return 0;
-}
static inline bool update_insn(struct bpf_verifier_env *env,
struct func_instance *instance, u32 frame, u32 insn_idx)
{
- struct bpf_insn_aux_data *aux = env->insn_aux_data;
- u64 new_before, new_after, must_write_acc;
+ spis_t new_before, new_after;
struct per_frame_masks *insn, *succ_insn;
struct bpf_iarray *succ;
u32 s;
@@ -574,77 +285,40 @@ static inline bool update_insn(struct bpf_verifier_env *env,
changed = false;
insn = get_frame_masks(instance, frame, insn_idx);
- new_before = 0;
- new_after = 0;
- /*
- * New "must_write_acc" is an intersection of all "must_write_acc"
- * of successors plus all "must_write" slots of instruction itself.
- */
- must_write_acc = U64_MAX;
+ new_before = SPIS_ZERO;
+ new_after = SPIS_ZERO;
for (s = 0; s < succ->cnt; ++s) {
succ_insn = get_frame_masks(instance, frame, succ->items[s]);
- new_after |= succ_insn->live_before;
- must_write_acc &= succ_insn->must_write_acc;
+ new_after = spis_or(new_after, succ_insn->live_before);
}
- must_write_acc |= insn->must_write;
/*
* New "live_before" is a union of all "live_before" of successors
* minus slots written by instruction plus slots read by instruction.
+ * new_before = (new_after & ~insn->must_write) | insn->may_read
*/
- new_before = (new_after & ~insn->must_write) | insn->may_read;
- changed |= new_before != insn->live_before;
- changed |= must_write_acc != insn->must_write_acc;
- if (unlikely(env->log.level & BPF_LOG_LEVEL2) &&
- (insn->may_read || insn->must_write ||
- insn_idx == callchain_subprog_start(&instance->callchain) ||
- aux[insn_idx].prune_point)) {
- log_mask_change(env, &instance->callchain, "live",
- frame, insn_idx, insn->live_before, new_before);
- log_mask_change(env, &instance->callchain, "written",
- frame, insn_idx, insn->must_write_acc, must_write_acc);
- }
+ new_before = spis_or(spis_and(new_after, spis_not(insn->must_write)),
+ insn->may_read);
+ changed |= !spis_equal(new_before, insn->live_before);
insn->live_before = new_before;
- insn->must_write_acc = must_write_acc;
return changed;
}
-/* Fixed-point computation of @live_before and @must_write_acc marks */
-static int update_instance(struct bpf_verifier_env *env, struct func_instance *instance)
+/* Fixed-point computation of @live_before marks */
+static void update_instance(struct bpf_verifier_env *env, struct func_instance *instance)
{
- u32 i, frame, po_start, po_end, cnt, this_subprog_start;
- struct callchain *callchain = &instance->callchain;
+ u32 i, frame, po_start, po_end;
int *insn_postorder = env->cfg.insn_postorder;
struct bpf_subprog_info *subprog;
- struct per_frame_masks *insn;
bool changed;
- int err;
- this_subprog_start = callchain_subprog_start(callchain);
- /*
- * If must_write marks were updated must_write_acc needs to be reset
- * (to account for the case when new must_write sets became smaller).
- */
- if (instance->must_write_dropped) {
- for (frame = 0; frame <= callchain->curframe; frame++) {
- if (!instance->frames[frame])
- continue;
-
- for (i = 0; i < instance->insn_cnt; i++) {
- insn = get_frame_masks(instance, frame, this_subprog_start + i);
- insn->must_write_acc = 0;
- }
- }
- }
-
- subprog = bpf_find_containing_subprog(env, this_subprog_start);
+ instance->must_write_initialized = true;
+ subprog = &env->subprog_info[instance->subprog];
po_start = subprog->postorder_start;
po_end = (subprog + 1)->postorder_start;
- cnt = 0;
/* repeat until fixed point is reached */
do {
- cnt++;
changed = false;
- for (frame = 0; frame <= instance->callchain.curframe; frame++) {
+ for (frame = 0; frame <= instance->depth; frame++) {
if (!instance->frames[frame])
continue;
@@ -652,57 +326,14 @@ static int update_instance(struct bpf_verifier_env *env, struct func_instance *i
changed |= update_insn(env, instance, frame, insn_postorder[i]);
}
} while (changed);
-
- if (env->log.level & BPF_LOG_LEVEL2)
- bpf_log(&env->log, "%s live stack update done in %d iterations\n",
- fmt_callchain(env, callchain), cnt);
-
- /* transfer marks accumulated for outer frames to outer func instance (caller) */
- if (callchain->curframe > 0) {
- err = propagate_to_outer_instance(env, instance);
- if (err)
- return err;
- }
-
- return 0;
-}
-
-/*
- * Prepare all callchains within @env->cur_state for querying.
- * This function should be called after each verifier.c:pop_stack()
- * and whenever verifier.c:do_check_insn() processes subprogram exit.
- * This would guarantee that visited verifier states with zero branches
- * have their bpf_mark_stack_{read,write}() effects propagated in
- * @env->liveness.
- */
-int bpf_update_live_stack(struct bpf_verifier_env *env)
-{
- struct func_instance *instance;
- int err, frame;
-
- bpf_reset_live_stack_callchain(env);
- for (frame = env->cur_state->curframe; frame >= 0; --frame) {
- instance = lookup_instance(env, env->cur_state, frame);
- if (IS_ERR(instance))
- return PTR_ERR(instance);
-
- if (instance->updated) {
- err = update_instance(env, instance);
- if (err)
- return err;
- instance->updated = false;
- instance->must_write_dropped = false;
- }
- }
- return 0;
}
-static bool is_live_before(struct func_instance *instance, u32 insn_idx, u32 frameno, u32 spi)
+static bool is_live_before(struct func_instance *instance, u32 insn_idx, u32 frameno, u32 half_spi)
{
struct per_frame_masks *masks;
masks = get_frame_masks(instance, frameno, insn_idx);
- return masks && (masks->live_before & BIT(spi));
+ return masks && spis_test_bit(masks->live_before, half_spi);
}
int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
@@ -714,41 +345,1868 @@ int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_
memset(q, 0, sizeof(*q));
for (frame = 0; frame <= st->curframe; frame++) {
instance = lookup_instance(env, st, frame);
- if (IS_ERR(instance))
- return PTR_ERR(instance);
- q->instances[frame] = instance;
+ if (IS_ERR_OR_NULL(instance))
+ q->instances[frame] = NULL;
+ else
+ q->instances[frame] = instance;
+ if (frame < st->curframe)
+ q->callsites[frame] = st->frame[frame + 1]->callsite;
}
q->curframe = st->curframe;
q->insn_idx = st->insn_idx;
return 0;
}
-bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi)
+bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 half_spi)
{
/*
- * Slot is alive if it is read before q->st->insn_idx in current func instance,
+ * Slot is alive if it is read before q->insn_idx in current func instance,
* or if for some outer func instance:
* - alive before callsite if callsite calls callback, otherwise
* - alive after callsite
*/
struct live_stack_query *q = &env->liveness->live_stack_query;
struct func_instance *instance, *curframe_instance;
- u32 i, callsite;
- bool alive;
+ u32 i, callsite, rel;
+ int cur_delta, delta;
+ bool alive = false;
curframe_instance = q->instances[q->curframe];
- if (is_live_before(curframe_instance, q->insn_idx, frameno, spi))
+ if (!curframe_instance)
+ return true;
+ cur_delta = (int)curframe_instance->depth - (int)q->curframe;
+ rel = frameno + cur_delta;
+ if (rel <= curframe_instance->depth)
+ alive = is_live_before(curframe_instance, q->insn_idx, rel, half_spi);
+
+ if (alive)
return true;
for (i = frameno; i < q->curframe; i++) {
- callsite = curframe_instance->callchain.callsites[i];
instance = q->instances[i];
+ if (!instance)
+ return true;
+ /* Map actual frameno to frame index within this instance */
+ delta = (int)instance->depth - (int)i;
+ rel = frameno + delta;
+ if (rel > instance->depth)
+ return true;
+
+ /* Get callsite from verifier state, not from instance callchain */
+ callsite = q->callsites[i];
+
alive = bpf_calls_callback(env, callsite)
- ? is_live_before(instance, callsite, frameno, spi)
- : is_live_before(instance, callsite + 1, frameno, spi);
+ ? is_live_before(instance, callsite, rel, half_spi)
+ : is_live_before(instance, callsite + 1, rel, half_spi);
if (alive)
return true;
}
return false;
}
+
+static char *fmt_subprog(struct bpf_verifier_env *env, int subprog)
+{
+ const char *name = env->subprog_info[subprog].name;
+
+ snprintf(env->tmp_str_buf, sizeof(env->tmp_str_buf),
+ "subprog#%d%s%s", subprog, name ? " " : "", name ? name : "");
+ return env->tmp_str_buf;
+}
+
+static char *fmt_instance(struct bpf_verifier_env *env, struct func_instance *instance)
+{
+ snprintf(env->tmp_str_buf, sizeof(env->tmp_str_buf),
+ "(d%d,cs%d)", instance->depth, instance->callsite);
+ return env->tmp_str_buf;
+}
+
+static int spi_off(int spi)
+{
+ return -(spi + 1) * BPF_REG_SIZE;
+}
+
+/*
+ * When both halves of an 8-byte SPI are set, print as "-8","-16",...
+ * When only one half is set, print as "-4h","-8h",...
+ * Runs of 3+ consecutive fully-set SPIs are collapsed: "fp0-8..-24"
+ */
+static char *fmt_spis_mask(struct bpf_verifier_env *env, int frame, bool first, spis_t spis)
+{
+ int buf_sz = sizeof(env->tmp_str_buf);
+ char *buf = env->tmp_str_buf;
+ int spi, n, run_start;
+
+ buf[0] = '\0';
+
+ for (spi = 0; spi < STACK_SLOTS / 2 && buf_sz > 0; spi++) {
+ bool lo = spis_test_bit(spis, spi * 2);
+ bool hi = spis_test_bit(spis, spi * 2 + 1);
+ const char *space = first ? "" : " ";
+
+ if (!lo && !hi)
+ continue;
+
+ if (!lo || !hi) {
+ /* half-spi */
+ n = scnprintf(buf, buf_sz, "%sfp%d%d%s",
+ space, frame, spi_off(spi) + (lo ? STACK_SLOT_SZ : 0), "h");
+ } else if (spi + 2 < STACK_SLOTS / 2 &&
+ spis_test_bit(spis, spi * 2 + 2) &&
+ spis_test_bit(spis, spi * 2 + 3) &&
+ spis_test_bit(spis, spi * 2 + 4) &&
+ spis_test_bit(spis, spi * 2 + 5)) {
+ /* 3+ consecutive full spis */
+ run_start = spi;
+ while (spi + 1 < STACK_SLOTS / 2 &&
+ spis_test_bit(spis, (spi + 1) * 2) &&
+ spis_test_bit(spis, (spi + 1) * 2 + 1))
+ spi++;
+ n = scnprintf(buf, buf_sz, "%sfp%d%d..%d",
+ space, frame, spi_off(run_start), spi_off(spi));
+ } else {
+ /* just a full spi */
+ n = scnprintf(buf, buf_sz, "%sfp%d%d", space, frame, spi_off(spi));
+ }
+ first = false;
+ buf += n;
+ buf_sz -= n;
+ }
+ return env->tmp_str_buf;
+}
+
+static void print_instance(struct bpf_verifier_env *env, struct func_instance *instance)
+{
+ int start = env->subprog_info[instance->subprog].start;
+ struct bpf_insn *insns = env->prog->insnsi;
+ struct per_frame_masks *masks;
+ int len = instance->insn_cnt;
+ int insn_idx, frame, i;
+ bool has_use, has_def;
+ u64 pos, insn_pos;
+
+ if (!(env->log.level & BPF_LOG_LEVEL2))
+ return;
+
+ verbose(env, "stack use/def %s ", fmt_subprog(env, instance->subprog));
+ verbose(env, "%s:\n", fmt_instance(env, instance));
+ for (i = 0; i < len; i++) {
+ insn_idx = start + i;
+ has_use = false;
+ has_def = false;
+ pos = env->log.end_pos;
+ verbose(env, "%3d: ", insn_idx);
+ bpf_verbose_insn(env, &insns[insn_idx]);
+ bpf_vlog_reset(&env->log, env->log.end_pos - 1); /* remove \n */
+ insn_pos = env->log.end_pos;
+ verbose(env, "%*c;", bpf_vlog_alignment(insn_pos - pos), ' ');
+ pos = env->log.end_pos;
+ verbose(env, " use: ");
+ for (frame = instance->depth; frame >= 0; --frame) {
+ masks = get_frame_masks(instance, frame, insn_idx);
+ if (!masks || spis_is_zero(masks->may_read))
+ continue;
+ verbose(env, "%s", fmt_spis_mask(env, frame, !has_use, masks->may_read));
+ has_use = true;
+ }
+ if (!has_use)
+ bpf_vlog_reset(&env->log, pos);
+ pos = env->log.end_pos;
+ verbose(env, " def: ");
+ for (frame = instance->depth; frame >= 0; --frame) {
+ masks = get_frame_masks(instance, frame, insn_idx);
+ if (!masks || spis_is_zero(masks->must_write))
+ continue;
+ verbose(env, "%s", fmt_spis_mask(env, frame, !has_def, masks->must_write));
+ has_def = true;
+ }
+ if (!has_def)
+ bpf_vlog_reset(&env->log, has_use ? pos : insn_pos);
+ verbose(env, "\n");
+ if (bpf_is_ldimm64(&insns[insn_idx]))
+ i++;
+ }
+}
+
+static int cmp_instances(const void *pa, const void *pb)
+{
+ struct func_instance *a = *(struct func_instance **)pa;
+ struct func_instance *b = *(struct func_instance **)pb;
+ int dcallsite = (int)a->callsite - b->callsite;
+ int ddepth = (int)a->depth - b->depth;
+
+ if (dcallsite)
+ return dcallsite;
+ if (ddepth)
+ return ddepth;
+ return 0;
+}
+
+/* print use/def slots for all instances ordered by callsite first, then by depth */
+static int print_instances(struct bpf_verifier_env *env)
+{
+ struct func_instance *instance, **sorted_instances;
+ struct bpf_liveness *liveness = env->liveness;
+ int i, bkt, cnt;
+
+ cnt = 0;
+ hash_for_each(liveness->func_instances, bkt, instance, hl_node)
+ cnt++;
+ sorted_instances = kvmalloc_objs(*sorted_instances, cnt, GFP_KERNEL_ACCOUNT);
+ if (!sorted_instances)
+ return -ENOMEM;
+ cnt = 0;
+ hash_for_each(liveness->func_instances, bkt, instance, hl_node)
+ sorted_instances[cnt++] = instance;
+ sort(sorted_instances, cnt, sizeof(*sorted_instances), cmp_instances, NULL);
+ for (i = 0; i < cnt; i++)
+ print_instance(env, sorted_instances[i]);
+ kvfree(sorted_instances);
+ return 0;
+}
+
+/*
+ * Per-register tracking state for compute_subprog_args().
+ * Tracks which frame's FP a value is derived from
+ * and the byte offset from that frame's FP.
+ *
+ * The .frame field forms a lattice with three levels of precision:
+ *
+ * precise {frame=N, off=V} -- known absolute frame index and byte offset
+ * |
+ * offset-imprecise {frame=N, cnt=0}
+ * | -- known frame identity, unknown offset
+ * fully-imprecise {frame=ARG_IMPRECISE, mask=bitmask}
+ * -- unknown frame identity; .mask is a
+ * bitmask of which frame indices might be
+ * involved
+ *
+ * At CFG merge points, arg_track_join() moves down the lattice:
+ * - same frame + same offset -> precise
+ * - same frame + different offset -> offset-imprecise
+ * - different frames -> fully-imprecise (bitmask OR)
+ *
+ * At memory access sites (LDX/STX/ST), offset-imprecise marks only
+ * the known frame's access mask as SPIS_ALL, while fully-imprecise
+ * iterates bits in the bitmask and routes each frame to its target.
+ */
+#define MAX_ARG_OFFSETS 4
+
+struct arg_track {
+ union {
+ s16 off[MAX_ARG_OFFSETS]; /* byte offsets; off_cnt says how many */
+ u16 mask; /* arg bitmask when arg == ARG_IMPRECISE */
+ };
+ s8 frame; /* absolute frame index, or enum arg_track_state */
+ s8 off_cnt; /* 0 = offset-imprecise, 1-4 = # of precise offsets */
+};
+
+enum arg_track_state {
+ ARG_NONE = -1, /* not derived from any argument */
+ ARG_UNVISITED = -2, /* not yet reached by dataflow */
+ ARG_IMPRECISE = -3, /* lost identity; .mask is arg bitmask */
+};
+
+/* Track callee stack slots fp-8 through fp-512 (64 slots of 8 bytes each) */
+#define MAX_ARG_SPILL_SLOTS 64
+
+static bool arg_is_visited(const struct arg_track *at)
+{
+ return at->frame != ARG_UNVISITED;
+}
+
+static bool arg_is_fp(const struct arg_track *at)
+{
+ return at->frame >= 0 || at->frame == ARG_IMPRECISE;
+}
+
+static void verbose_arg_track(struct bpf_verifier_env *env, struct arg_track *at)
+{
+ int i;
+
+ switch (at->frame) {
+ case ARG_NONE: verbose(env, "_"); break;
+ case ARG_UNVISITED: verbose(env, "?"); break;
+ case ARG_IMPRECISE: verbose(env, "IMP%x", at->mask); break;
+ default:
+ /* frame >= 0: absolute frame index */
+ if (at->off_cnt == 0) {
+ verbose(env, "fp%d ?", at->frame);
+ } else {
+ for (i = 0; i < at->off_cnt; i++) {
+ if (i)
+ verbose(env, "|");
+ verbose(env, "fp%d%+d", at->frame, at->off[i]);
+ }
+ }
+ break;
+ }
+}
+
+static bool arg_track_eq(const struct arg_track *a, const struct arg_track *b)
+{
+ int i;
+
+ if (a->frame != b->frame)
+ return false;
+ if (a->frame == ARG_IMPRECISE)
+ return a->mask == b->mask;
+ if (a->frame < 0)
+ return true;
+ if (a->off_cnt != b->off_cnt)
+ return false;
+ for (i = 0; i < a->off_cnt; i++)
+ if (a->off[i] != b->off[i])
+ return false;
+ return true;
+}
+
+static struct arg_track arg_single(s8 arg, s16 off)
+{
+ struct arg_track at = {};
+
+ at.frame = arg;
+ at.off[0] = off;
+ at.off_cnt = 1;
+ return at;
+}
+
+/*
+ * Merge two sorted offset arrays, deduplicate.
+ * Returns off_cnt=0 if the result exceeds MAX_ARG_OFFSETS.
+ * Both args must have the same frame and off_cnt > 0.
+ */
+static struct arg_track arg_merge_offsets(struct arg_track a, struct arg_track b)
+{
+ struct arg_track result = { .frame = a.frame };
+ struct arg_track imp = { .frame = a.frame };
+ int i = 0, j = 0, k = 0;
+
+ while (i < a.off_cnt && j < b.off_cnt) {
+ s16 v;
+
+ if (a.off[i] <= b.off[j]) {
+ v = a.off[i++];
+ if (v == b.off[j])
+ j++;
+ } else {
+ v = b.off[j++];
+ }
+ if (k > 0 && result.off[k - 1] == v)
+ continue;
+ if (k >= MAX_ARG_OFFSETS)
+ return imp;
+ result.off[k++] = v;
+ }
+ while (i < a.off_cnt) {
+ if (k >= MAX_ARG_OFFSETS)
+ return imp;
+ result.off[k++] = a.off[i++];
+ }
+ while (j < b.off_cnt) {
+ if (k >= MAX_ARG_OFFSETS)
+ return imp;
+ result.off[k++] = b.off[j++];
+ }
+ result.off_cnt = k;
+ return result;
+}
+
+/*
+ * Merge two arg_tracks into ARG_IMPRECISE, collecting the frame
+ * bits from both operands. Precise frame indices (frame >= 0)
+ * contribute a single bit; existing ARG_IMPRECISE values
+ * contribute their full bitmask.
+ */
+static struct arg_track arg_join_imprecise(struct arg_track a, struct arg_track b)
+{
+ u32 m = 0;
+
+ if (a.frame >= 0)
+ m |= BIT(a.frame);
+ else if (a.frame == ARG_IMPRECISE)
+ m |= a.mask;
+
+ if (b.frame >= 0)
+ m |= BIT(b.frame);
+ else if (b.frame == ARG_IMPRECISE)
+ m |= b.mask;
+
+ return (struct arg_track){ .mask = m, .frame = ARG_IMPRECISE };
+}
+
+/* Join two arg_track values at merge points */
+static struct arg_track __arg_track_join(struct arg_track a, struct arg_track b)
+{
+ if (!arg_is_visited(&b))
+ return a;
+ if (!arg_is_visited(&a))
+ return b;
+ if (a.frame == b.frame && a.frame >= 0) {
+ /* Both offset-imprecise: stay imprecise */
+ if (a.off_cnt == 0 || b.off_cnt == 0)
+ return (struct arg_track){ .frame = a.frame };
+ /* Merge offset sets; falls back to off_cnt=0 if >4 */
+ return arg_merge_offsets(a, b);
+ }
+
+ /*
+ * args are different, but one of them is known
+ * arg + none -> arg
+ * none + arg -> arg
+ *
+ * none + none -> none
+ */
+ if (a.frame == ARG_NONE && b.frame == ARG_NONE)
+ return a;
+ if (a.frame >= 0 && b.frame == ARG_NONE) {
+ /*
+ * When joining single fp-N add fake fp+0 to
+ * keep stack_use and prevent stack_def
+ */
+ if (a.off_cnt == 1)
+ return arg_merge_offsets(a, arg_single(a.frame, 0));
+ return a;
+ }
+ if (b.frame >= 0 && a.frame == ARG_NONE) {
+ if (b.off_cnt == 1)
+ return arg_merge_offsets(b, arg_single(b.frame, 0));
+ return b;
+ }
+
+ return arg_join_imprecise(a, b);
+}
+
+static bool arg_track_join(struct bpf_verifier_env *env, int idx, int target, int r,
+ struct arg_track *in, struct arg_track out)
+{
+ struct arg_track old = *in;
+ struct arg_track new_val = __arg_track_join(old, out);
+
+ if (arg_track_eq(&new_val, &old))
+ return false;
+
+ *in = new_val;
+ if (!(env->log.level & BPF_LOG_LEVEL2) || !arg_is_visited(&old))
+ return true;
+
+ verbose(env, "arg JOIN insn %d -> %d ", idx, target);
+ if (r >= 0)
+ verbose(env, "r%d: ", r);
+ else
+ verbose(env, "fp%+d: ", r * 8);
+ verbose_arg_track(env, &old);
+ verbose(env, " + ");
+ verbose_arg_track(env, &out);
+ verbose(env, " => ");
+ verbose_arg_track(env, &new_val);
+ verbose(env, "\n");
+ return true;
+}
+
+/*
+ * Compute the result when an ALU op destroys offset precision.
+ * If a single arg is identifiable, preserve it with OFF_IMPRECISE.
+ * If two different args are involved or one is already ARG_IMPRECISE,
+ * the result is fully ARG_IMPRECISE.
+ */
+static void arg_track_alu64(struct arg_track *dst, const struct arg_track *src)
+{
+ WARN_ON_ONCE(!arg_is_visited(dst));
+ WARN_ON_ONCE(!arg_is_visited(src));
+
+ if (dst->frame >= 0 && (src->frame == ARG_NONE || src->frame == dst->frame)) {
+ /*
+ * rX += rY where rY is not arg derived
+ * rX += rX
+ */
+ dst->off_cnt = 0;
+ return;
+ }
+ if (src->frame >= 0 && dst->frame == ARG_NONE) {
+ /*
+ * rX += rY where rX is not arg derived
+ * rY identity leaks into rX
+ */
+ dst->off_cnt = 0;
+ dst->frame = src->frame;
+ return;
+ }
+
+ if (dst->frame == ARG_NONE && src->frame == ARG_NONE)
+ return;
+
+ *dst = arg_join_imprecise(*dst, *src);
+}
+
+static bool arg_add(s16 off, s64 delta, s16 *out)
+{
+ s16 d = delta;
+
+ if (d != delta)
+ return true;
+ return check_add_overflow(off, d, out);
+}
+
+static void arg_padd(struct arg_track *at, s64 delta)
+{
+ int i;
+
+ if (at->off_cnt == 0)
+ return;
+ for (i = 0; i < at->off_cnt; i++) {
+ s16 new_off;
+
+ if (arg_add(at->off[i], delta, &new_off)) {
+ at->off_cnt = 0;
+ return;
+ }
+ at->off[i] = new_off;
+ }
+}
+
+/*
+ * Convert a byte offset from FP to a callee stack slot index.
+ * Returns -1 if out of range or not 8-byte aligned.
+ * Slot 0 = fp-8, slot 1 = fp-16, ..., slot 7 = fp-64, ....
+ */
+static int fp_off_to_slot(s16 off)
+{
+ if (off >= 0 || off < -(int)(MAX_ARG_SPILL_SLOTS * 8))
+ return -1;
+ if (off % 8)
+ return -1;
+ return (-off) / 8 - 1;
+}
+
+static struct arg_track fill_from_stack(struct bpf_insn *insn,
+ struct arg_track *at_out, int reg,
+ struct arg_track *at_stack_out,
+ int depth)
+{
+ struct arg_track imp = {
+ .mask = (1u << (depth + 1)) - 1,
+ .frame = ARG_IMPRECISE
+ };
+ struct arg_track result = { .frame = ARG_NONE };
+ int cnt, i;
+
+ if (reg == BPF_REG_FP) {
+ int slot = fp_off_to_slot(insn->off);
+
+ return slot >= 0 ? at_stack_out[slot] : imp;
+ }
+ cnt = at_out[reg].off_cnt;
+ if (cnt == 0)
+ return imp;
+
+ for (i = 0; i < cnt; i++) {
+ s16 fp_off, slot;
+
+ if (arg_add(at_out[reg].off[i], insn->off, &fp_off))
+ return imp;
+ slot = fp_off_to_slot(fp_off);
+ if (slot < 0)
+ return imp;
+ result = __arg_track_join(result, at_stack_out[slot]);
+ }
+ return result;
+}
+
+/*
+ * Spill @val to all possible stack slots indicated by the FP offsets in @reg.
+ * For an 8-byte store, single candidate slot gets @val. multi-slots are joined.
+ * sub-8-byte store joins with ARG_NONE.
+ * When exact offset is unknown conservatively add reg values to all slots in at_stack_out.
+ */
+static void spill_to_stack(struct bpf_insn *insn, struct arg_track *at_out,
+ int reg, struct arg_track *at_stack_out,
+ struct arg_track *val, u32 sz)
+{
+ struct arg_track none = { .frame = ARG_NONE };
+ struct arg_track new_val = sz == 8 ? *val : none;
+ int cnt, i;
+
+ if (reg == BPF_REG_FP) {
+ int slot = fp_off_to_slot(insn->off);
+
+ if (slot >= 0)
+ at_stack_out[slot] = new_val;
+ return;
+ }
+ cnt = at_out[reg].off_cnt;
+ if (cnt == 0) {
+ for (int slot = 0; slot < MAX_ARG_SPILL_SLOTS; slot++)
+ at_stack_out[slot] = __arg_track_join(at_stack_out[slot], new_val);
+ return;
+ }
+ for (i = 0; i < cnt; i++) {
+ s16 fp_off;
+ int slot;
+
+ if (arg_add(at_out[reg].off[i], insn->off, &fp_off))
+ continue;
+ slot = fp_off_to_slot(fp_off);
+ if (slot < 0)
+ continue;
+ if (cnt == 1)
+ at_stack_out[slot] = new_val;
+ else
+ at_stack_out[slot] = __arg_track_join(at_stack_out[slot], new_val);
+ }
+}
+
+/*
+ * Clear all tracked callee stack slots overlapping the byte range
+ * [off, off+sz-1] where off is a negative FP-relative offset.
+ */
+static void clear_overlapping_stack_slots(struct arg_track *at_stack, s16 off, u32 sz, int cnt)
+{
+ struct arg_track none = { .frame = ARG_NONE };
+
+ if (cnt == 0) {
+ for (int i = 0; i < MAX_ARG_SPILL_SLOTS; i++)
+ at_stack[i] = __arg_track_join(at_stack[i], none);
+ return;
+ }
+ for (int i = 0; i < MAX_ARG_SPILL_SLOTS; i++) {
+ int slot_start = -((i + 1) * 8);
+ int slot_end = slot_start + 8;
+
+ if (slot_start < off + (int)sz && slot_end > off) {
+ if (cnt == 1)
+ at_stack[i] = none;
+ else
+ at_stack[i] = __arg_track_join(at_stack[i], none);
+ }
+ }
+}
+
+/*
+ * Clear stack slots overlapping all possible FP offsets in @reg.
+ */
+static void clear_stack_for_all_offs(struct bpf_insn *insn,
+ struct arg_track *at_out, int reg,
+ struct arg_track *at_stack_out, u32 sz)
+{
+ int cnt, i;
+
+ if (reg == BPF_REG_FP) {
+ clear_overlapping_stack_slots(at_stack_out, insn->off, sz, 1);
+ return;
+ }
+ cnt = at_out[reg].off_cnt;
+ if (cnt == 0) {
+ clear_overlapping_stack_slots(at_stack_out, 0, sz, cnt);
+ return;
+ }
+ for (i = 0; i < cnt; i++) {
+ s16 fp_off;
+
+ if (arg_add(at_out[reg].off[i], insn->off, &fp_off)) {
+ clear_overlapping_stack_slots(at_stack_out, 0, sz, 0);
+ break;
+ }
+ clear_overlapping_stack_slots(at_stack_out, fp_off, sz, cnt);
+ }
+}
+
+static void arg_track_log(struct bpf_verifier_env *env, struct bpf_insn *insn, int idx,
+ struct arg_track *at_in, struct arg_track *at_stack_in,
+ struct arg_track *at_out, struct arg_track *at_stack_out)
+{
+ bool printed = false;
+ int i;
+
+ if (!(env->log.level & BPF_LOG_LEVEL2))
+ return;
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ if (arg_track_eq(&at_out[i], &at_in[i]))
+ continue;
+ if (!printed) {
+ verbose(env, "%3d: ", idx);
+ bpf_verbose_insn(env, insn);
+ bpf_vlog_reset(&env->log, env->log.end_pos - 1);
+ printed = true;
+ }
+ verbose(env, "\tr%d: ", i); verbose_arg_track(env, &at_in[i]);
+ verbose(env, " -> "); verbose_arg_track(env, &at_out[i]);
+ }
+ for (i = 0; i < MAX_ARG_SPILL_SLOTS; i++) {
+ if (arg_track_eq(&at_stack_out[i], &at_stack_in[i]))
+ continue;
+ if (!printed) {
+ verbose(env, "%3d: ", idx);
+ bpf_verbose_insn(env, insn);
+ bpf_vlog_reset(&env->log, env->log.end_pos - 1);
+ printed = true;
+ }
+ verbose(env, "\tfp%+d: ", -(i + 1) * 8); verbose_arg_track(env, &at_stack_in[i]);
+ verbose(env, " -> "); verbose_arg_track(env, &at_stack_out[i]);
+ }
+ if (printed)
+ verbose(env, "\n");
+}
+
+static bool can_be_local_fp(int depth, int regno, struct arg_track *at)
+{
+ return regno == BPF_REG_FP || at->frame == depth ||
+ (at->frame == ARG_IMPRECISE && (at->mask & BIT(depth)));
+}
+
+/*
+ * Pure dataflow transfer function for arg_track state.
+ * Updates at_out[] based on how the instruction modifies registers.
+ * Tracks spill/fill, but not other memory accesses.
+ */
+static void arg_track_xfer(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int insn_idx,
+ struct arg_track *at_out, struct arg_track *at_stack_out,
+ struct func_instance *instance,
+ u32 *callsites)
+{
+ int depth = instance->depth;
+ u8 class = BPF_CLASS(insn->code);
+ u8 code = BPF_OP(insn->code);
+ struct arg_track *dst = &at_out[insn->dst_reg];
+ struct arg_track *src = &at_out[insn->src_reg];
+ struct arg_track none = { .frame = ARG_NONE };
+ int r;
+
+ if (class == BPF_ALU64 && BPF_SRC(insn->code) == BPF_K) {
+ if (code == BPF_MOV) {
+ *dst = none;
+ } else if (dst->frame >= 0) {
+ if (code == BPF_ADD)
+ arg_padd(dst, insn->imm);
+ else if (code == BPF_SUB)
+ arg_padd(dst, -(s64)insn->imm);
+ else
+ /* Any other 64-bit alu on the pointer makes it imprecise */
+ dst->off_cnt = 0;
+ } /* else if dst->frame is imprecise it stays so */
+ } else if (class == BPF_ALU64 && BPF_SRC(insn->code) == BPF_X) {
+ if (code == BPF_MOV) {
+ if (insn->off == 0) {
+ *dst = *src;
+ } else {
+ /* addr_space_cast destroys a pointer */
+ *dst = none;
+ }
+ } else {
+ arg_track_alu64(dst, src);
+ }
+ } else if (class == BPF_ALU) {
+ /*
+ * 32-bit alu destroys the pointer.
+ * If src was a pointer it cannot leak into dst
+ */
+ *dst = none;
+ } else if (class == BPF_JMP && code == BPF_CALL) {
+ /*
+ * at_stack_out[slot] is not cleared by the helper and subprog calls.
+ * The fill_from_stack() may return the stale spill — which is an FP-derived arg_track
+ * (the value that was originally spilled there). The loaded register then carries
+ * a phantom FP-derived identity that doesn't correspond to what's actually in the slot.
+ * This phantom FP pointer propagates forward, and wherever it's subsequently used
+ * (as a helper argument, another store, etc.), it sets stack liveness bits.
+ * Those bits correspond to stack accesses that don't actually happen.
+ * So the effect is over-reporting stack liveness — marking slots as live that aren't
+ * actually accessed. The verifier preserves more state than necessary across calls,
+ * which is conservative.
+ *
+ * helpers can scratch stack slots, but they won't make a valid pointer out of it.
+ * subprogs are allowed to write into parent slots, but they cannot write
+ * _any_ FP-derived pointer into it (either their own or parent's FP).
+ */
+ for (r = BPF_REG_0; r <= BPF_REG_5; r++)
+ at_out[r] = none;
+ } else if (class == BPF_LDX) {
+ u32 sz = bpf_size_to_bytes(BPF_SIZE(insn->code));
+ bool src_is_local_fp = can_be_local_fp(depth, insn->src_reg, src);
+
+ /*
+ * Reload from callee stack: if src is current-frame FP-derived
+ * and the load is an 8-byte BPF_MEM, try to restore the spill
+ * identity. For imprecise sources fill_from_stack() returns
+ * ARG_IMPRECISE (off_cnt == 0).
+ */
+ if (src_is_local_fp && BPF_MODE(insn->code) == BPF_MEM && sz == 8) {
+ *dst = fill_from_stack(insn, at_out, insn->src_reg, at_stack_out, depth);
+ } else if (src->frame >= 0 && src->frame < depth &&
+ BPF_MODE(insn->code) == BPF_MEM && sz == 8) {
+ struct arg_track *parent_stack =
+ env->callsite_at_stack[callsites[src->frame]];
+
+ *dst = fill_from_stack(insn, at_out, insn->src_reg,
+ parent_stack, src->frame);
+ } else if (src->frame == ARG_IMPRECISE &&
+ !(src->mask & BIT(depth)) && src->mask &&
+ BPF_MODE(insn->code) == BPF_MEM && sz == 8) {
+ /*
+ * Imprecise src with only parent-frame bits:
+ * conservative fallback.
+ */
+ *dst = *src;
+ } else {
+ *dst = none;
+ }
+ } else if (class == BPF_LD && BPF_MODE(insn->code) == BPF_IMM) {
+ *dst = none;
+ } else if (class == BPF_STX) {
+ u32 sz = bpf_size_to_bytes(BPF_SIZE(insn->code));
+ bool dst_is_local_fp;
+
+ /* Track spills to current-frame FP-derived callee stack */
+ dst_is_local_fp = can_be_local_fp(depth, insn->dst_reg, dst);
+ if (dst_is_local_fp && BPF_MODE(insn->code) == BPF_MEM)
+ spill_to_stack(insn, at_out, insn->dst_reg,
+ at_stack_out, src, sz);
+
+ if (BPF_MODE(insn->code) == BPF_ATOMIC) {
+ if (dst_is_local_fp && insn->imm != BPF_LOAD_ACQ)
+ clear_stack_for_all_offs(insn, at_out, insn->dst_reg,
+ at_stack_out, sz);
+
+ if (insn->imm == BPF_CMPXCHG)
+ at_out[BPF_REG_0] = none;
+ else if (insn->imm == BPF_LOAD_ACQ)
+ *dst = none;
+ else if (insn->imm & BPF_FETCH)
+ *src = none;
+ }
+ } else if (class == BPF_ST && BPF_MODE(insn->code) == BPF_MEM) {
+ u32 sz = bpf_size_to_bytes(BPF_SIZE(insn->code));
+ bool dst_is_local_fp = can_be_local_fp(depth, insn->dst_reg, dst);
+
+ /* BPF_ST to FP-derived dst: clear overlapping stack slots */
+ if (dst_is_local_fp)
+ clear_stack_for_all_offs(insn, at_out, insn->dst_reg,
+ at_stack_out, sz);
+ }
+}
+
+/*
+ * Record access_bytes from helper/kfunc or load/store insn.
+ * access_bytes > 0: stack read
+ * access_bytes < 0: stack write
+ * access_bytes == S64_MIN: unknown — conservative, mark [0..slot] as read
+ * access_bytes == 0: no access
+ *
+ */
+static int record_stack_access_off(struct func_instance *instance, s64 fp_off,
+ s64 access_bytes, u32 frame, u32 insn_idx)
+{
+ s32 slot_hi, slot_lo;
+ spis_t mask;
+
+ if (fp_off >= 0)
+ /*
+ * out of bounds stack access doesn't contribute
+ * into actual stack liveness. It will be rejected
+ * by the main verifier pass later.
+ */
+ return 0;
+ if (access_bytes == S64_MIN) {
+ /* helper/kfunc read unknown amount of bytes from fp_off until fp+0 */
+ slot_hi = (-fp_off - 1) / STACK_SLOT_SZ;
+ mask = SPIS_ZERO;
+ spis_or_range(&mask, 0, slot_hi);
+ return mark_stack_read(instance, frame, insn_idx, mask);
+ }
+ if (access_bytes > 0) {
+ /* Mark any touched slot as use */
+ slot_hi = (-fp_off - 1) / STACK_SLOT_SZ;
+ slot_lo = max_t(s32, (-fp_off - access_bytes) / STACK_SLOT_SZ, 0);
+ mask = SPIS_ZERO;
+ spis_or_range(&mask, slot_lo, slot_hi);
+ return mark_stack_read(instance, frame, insn_idx, mask);
+ } else if (access_bytes < 0) {
+ /* Mark only fully covered slots as def */
+ access_bytes = -access_bytes;
+ slot_hi = (-fp_off) / STACK_SLOT_SZ - 1;
+ slot_lo = max_t(s32, (-fp_off - access_bytes + STACK_SLOT_SZ - 1) / STACK_SLOT_SZ, 0);
+ if (slot_lo <= slot_hi) {
+ mask = SPIS_ZERO;
+ spis_or_range(&mask, slot_lo, slot_hi);
+ return mark_stack_write(instance, frame, insn_idx, mask);
+ }
+ }
+ return 0;
+}
+
+/*
+ * 'arg' is FP-derived argument to helper/kfunc or load/store that
+ * reads (positive) or writes (negative) 'access_bytes' into 'use' or 'def'.
+ */
+static int record_stack_access(struct func_instance *instance,
+ const struct arg_track *arg,
+ s64 access_bytes, u32 frame, u32 insn_idx)
+{
+ int i, err;
+
+ if (access_bytes == 0)
+ return 0;
+ if (arg->off_cnt == 0) {
+ if (access_bytes > 0 || access_bytes == S64_MIN)
+ return mark_stack_read(instance, frame, insn_idx, SPIS_ALL);
+ return 0;
+ }
+ if (access_bytes != S64_MIN && access_bytes < 0 && arg->off_cnt != 1)
+ /* multi-offset write cannot set stack_def */
+ return 0;
+
+ for (i = 0; i < arg->off_cnt; i++) {
+ err = record_stack_access_off(instance, arg->off[i], access_bytes, frame, insn_idx);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+/*
+ * When a pointer is ARG_IMPRECISE, conservatively mark every frame in
+ * the bitmask as fully used.
+ */
+static int record_imprecise(struct func_instance *instance, u32 mask, u32 insn_idx)
+{
+ int depth = instance->depth;
+ int f, err;
+
+ for (f = 0; mask; f++, mask >>= 1) {
+ if (!(mask & 1))
+ continue;
+ if (f <= depth) {
+ err = mark_stack_read(instance, f, insn_idx, SPIS_ALL);
+ if (err)
+ return err;
+ }
+ }
+ return 0;
+}
+
+/* Record load/store access for a given 'at' state of 'insn'. */
+static int record_load_store_access(struct bpf_verifier_env *env,
+ struct func_instance *instance,
+ struct arg_track *at, int insn_idx)
+{
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
+ int depth = instance->depth;
+ s32 sz = bpf_size_to_bytes(BPF_SIZE(insn->code));
+ u8 class = BPF_CLASS(insn->code);
+ struct arg_track resolved, *ptr;
+ int oi;
+
+ switch (class) {
+ case BPF_LDX:
+ ptr = &at[insn->src_reg];
+ break;
+ case BPF_STX:
+ if (BPF_MODE(insn->code) == BPF_ATOMIC) {
+ if (insn->imm == BPF_STORE_REL)
+ sz = -sz;
+ if (insn->imm == BPF_LOAD_ACQ)
+ ptr = &at[insn->src_reg];
+ else
+ ptr = &at[insn->dst_reg];
+ } else {
+ ptr = &at[insn->dst_reg];
+ sz = -sz;
+ }
+ break;
+ case BPF_ST:
+ ptr = &at[insn->dst_reg];
+ sz = -sz;
+ break;
+ default:
+ return 0;
+ }
+
+ /* Resolve offsets: fold insn->off into arg_track */
+ if (ptr->off_cnt > 0) {
+ resolved.off_cnt = ptr->off_cnt;
+ resolved.frame = ptr->frame;
+ for (oi = 0; oi < ptr->off_cnt; oi++) {
+ if (arg_add(ptr->off[oi], insn->off, &resolved.off[oi])) {
+ resolved.off_cnt = 0;
+ break;
+ }
+ }
+ ptr = &resolved;
+ }
+
+ if (ptr->frame >= 0 && ptr->frame <= depth)
+ return record_stack_access(instance, ptr, sz, ptr->frame, insn_idx);
+ if (ptr->frame == ARG_IMPRECISE)
+ return record_imprecise(instance, ptr->mask, insn_idx);
+ /* ARG_NONE: not derived from any frame pointer, skip */
+ return 0;
+}
+
+/* Record stack access for a given 'at' state of helper/kfunc 'insn' */
+static int record_call_access(struct bpf_verifier_env *env,
+ struct func_instance *instance,
+ struct arg_track *at,
+ int insn_idx)
+{
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
+ int depth = instance->depth;
+ struct bpf_call_summary cs;
+ int r, err = 0, num_params = 5;
+
+ if (bpf_pseudo_call(insn))
+ return 0;
+
+ if (bpf_get_call_summary(env, insn, &cs))
+ num_params = cs.num_params;
+
+ for (r = BPF_REG_1; r < BPF_REG_1 + num_params; r++) {
+ int frame = at[r].frame;
+ s64 bytes;
+
+ if (!arg_is_fp(&at[r]))
+ continue;
+
+ if (bpf_helper_call(insn)) {
+ bytes = bpf_helper_stack_access_bytes(env, insn, r - 1, insn_idx);
+ } else if (bpf_pseudo_kfunc_call(insn)) {
+ bytes = bpf_kfunc_stack_access_bytes(env, insn, r - 1, insn_idx);
+ } else {
+ for (int f = 0; f <= depth; f++) {
+ err = mark_stack_read(instance, f, insn_idx, SPIS_ALL);
+ if (err)
+ return err;
+ }
+ return 0;
+ }
+ if (bytes == 0)
+ continue;
+
+ if (frame >= 0 && frame <= depth)
+ err = record_stack_access(instance, &at[r], bytes, frame, insn_idx);
+ else if (frame == ARG_IMPRECISE)
+ err = record_imprecise(instance, at[r].mask, insn_idx);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+/*
+ * For a calls_callback helper, find the callback subprog and determine
+ * which caller register maps to which callback register for FP passthrough.
+ */
+static int find_callback_subprog(struct bpf_verifier_env *env,
+ struct bpf_insn *insn, int insn_idx,
+ int *caller_reg, int *callee_reg)
+{
+ struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
+ int cb_reg = -1;
+
+ *caller_reg = -1;
+ *callee_reg = -1;
+
+ if (!bpf_helper_call(insn))
+ return -1;
+ switch (insn->imm) {
+ case BPF_FUNC_loop:
+ /* bpf_loop(nr, cb, ctx, flags): cb=R2, R3->cb R2 */
+ cb_reg = BPF_REG_2;
+ *caller_reg = BPF_REG_3;
+ *callee_reg = BPF_REG_2;
+ break;
+ case BPF_FUNC_for_each_map_elem:
+ /* for_each_map_elem(map, cb, ctx, flags): cb=R2, R3->cb R4 */
+ cb_reg = BPF_REG_2;
+ *caller_reg = BPF_REG_3;
+ *callee_reg = BPF_REG_4;
+ break;
+ case BPF_FUNC_find_vma:
+ /* find_vma(task, addr, cb, ctx, flags): cb=R3, R4->cb R3 */
+ cb_reg = BPF_REG_3;
+ *caller_reg = BPF_REG_4;
+ *callee_reg = BPF_REG_3;
+ break;
+ case BPF_FUNC_user_ringbuf_drain:
+ /* user_ringbuf_drain(map, cb, ctx, flags): cb=R2, R3->cb R2 */
+ cb_reg = BPF_REG_2;
+ *caller_reg = BPF_REG_3;
+ *callee_reg = BPF_REG_2;
+ break;
+ default:
+ return -1;
+ }
+
+ if (!(aux->const_reg_subprog_mask & BIT(cb_reg)))
+ return -2;
+
+ return aux->const_reg_vals[cb_reg];
+}
+
+/* Per-subprog intermediate state kept alive across analysis phases */
+struct subprog_at_info {
+ struct arg_track (*at_in)[MAX_BPF_REG];
+ int len;
+};
+
+static void print_subprog_arg_access(struct bpf_verifier_env *env,
+ int subprog,
+ struct subprog_at_info *info,
+ struct arg_track (*at_stack_in)[MAX_ARG_SPILL_SLOTS])
+{
+ struct bpf_insn *insns = env->prog->insnsi;
+ int start = env->subprog_info[subprog].start;
+ int len = info->len;
+ int i, r;
+
+ if (!(env->log.level & BPF_LOG_LEVEL2))
+ return;
+
+ verbose(env, "%s:\n", fmt_subprog(env, subprog));
+ for (i = 0; i < len; i++) {
+ int idx = start + i;
+ bool has_extra = false;
+ u8 cls = BPF_CLASS(insns[idx].code);
+ bool is_ldx_stx_call = cls == BPF_LDX || cls == BPF_STX ||
+ insns[idx].code == (BPF_JMP | BPF_CALL);
+
+ verbose(env, "%3d: ", idx);
+ bpf_verbose_insn(env, &insns[idx]);
+
+ /* Collect what needs printing */
+ if (is_ldx_stx_call &&
+ arg_is_visited(&info->at_in[i][0])) {
+ for (r = 0; r < MAX_BPF_REG - 1; r++)
+ if (arg_is_fp(&info->at_in[i][r]))
+ has_extra = true;
+ }
+ if (is_ldx_stx_call) {
+ for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++)
+ if (arg_is_fp(&at_stack_in[i][r]))
+ has_extra = true;
+ }
+
+ if (!has_extra) {
+ if (bpf_is_ldimm64(&insns[idx]))
+ i++;
+ continue;
+ }
+
+ bpf_vlog_reset(&env->log, env->log.end_pos - 1);
+ verbose(env, " //");
+
+ if (is_ldx_stx_call && info->at_in &&
+ arg_is_visited(&info->at_in[i][0])) {
+ for (r = 0; r < MAX_BPF_REG - 1; r++) {
+ if (!arg_is_fp(&info->at_in[i][r]))
+ continue;
+ verbose(env, " r%d=", r);
+ verbose_arg_track(env, &info->at_in[i][r]);
+ }
+ }
+
+ if (is_ldx_stx_call) {
+ for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) {
+ if (!arg_is_fp(&at_stack_in[i][r]))
+ continue;
+ verbose(env, " fp%+d=", -(r + 1) * 8);
+ verbose_arg_track(env, &at_stack_in[i][r]);
+ }
+ }
+
+ verbose(env, "\n");
+ if (bpf_is_ldimm64(&insns[idx]))
+ i++;
+ }
+}
+
+/*
+ * Compute arg tracking dataflow for a single subprog.
+ * Runs forward fixed-point with arg_track_xfer(), then records
+ * memory accesses in a single linear pass over converged state.
+ *
+ * @callee_entry: pre-populated entry state for R1-R5
+ * NULL for main (subprog 0).
+ * @info: stores at_in, len for debug printing.
+ */
+static int compute_subprog_args(struct bpf_verifier_env *env,
+ struct subprog_at_info *info,
+ struct arg_track *callee_entry,
+ struct func_instance *instance,
+ u32 *callsites)
+{
+ int subprog = instance->subprog;
+ struct bpf_insn *insns = env->prog->insnsi;
+ int depth = instance->depth;
+ int start = env->subprog_info[subprog].start;
+ int po_start = env->subprog_info[subprog].postorder_start;
+ int end = env->subprog_info[subprog + 1].start;
+ int po_end = env->subprog_info[subprog + 1].postorder_start;
+ int len = end - start;
+ struct arg_track (*at_in)[MAX_BPF_REG] = NULL;
+ struct arg_track at_out[MAX_BPF_REG];
+ struct arg_track (*at_stack_in)[MAX_ARG_SPILL_SLOTS] = NULL;
+ struct arg_track *at_stack_out = NULL;
+ struct arg_track unvisited = { .frame = ARG_UNVISITED };
+ struct arg_track none = { .frame = ARG_NONE };
+ bool changed;
+ int i, p, r, err = -ENOMEM;
+
+ at_in = kvmalloc_objs(*at_in, len, GFP_KERNEL_ACCOUNT);
+ if (!at_in)
+ goto err_free;
+
+ at_stack_in = kvmalloc_objs(*at_stack_in, len, GFP_KERNEL_ACCOUNT);
+ if (!at_stack_in)
+ goto err_free;
+
+ at_stack_out = kvmalloc_objs(*at_stack_out, MAX_ARG_SPILL_SLOTS, GFP_KERNEL_ACCOUNT);
+ if (!at_stack_out)
+ goto err_free;
+
+ for (i = 0; i < len; i++) {
+ for (r = 0; r < MAX_BPF_REG; r++)
+ at_in[i][r] = unvisited;
+ for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++)
+ at_stack_in[i][r] = unvisited;
+ }
+
+ for (r = 0; r < MAX_BPF_REG; r++)
+ at_in[0][r] = none;
+
+ /* Entry: R10 is always precisely the current frame's FP */
+ at_in[0][BPF_REG_FP] = arg_single(depth, 0);
+
+ /* R1-R5: from caller or ARG_NONE for main */
+ if (callee_entry) {
+ for (r = BPF_REG_1; r <= BPF_REG_5; r++)
+ at_in[0][r] = callee_entry[r];
+ }
+
+ /* Entry: all stack slots are ARG_NONE */
+ for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++)
+ at_stack_in[0][r] = none;
+
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "subprog#%d: analyzing (depth %d)...\n", subprog, depth);
+
+ /* Forward fixed-point iteration in reverse post order */
+redo:
+ changed = false;
+ for (p = po_end - 1; p >= po_start; p--) {
+ int idx = env->cfg.insn_postorder[p];
+ int i = idx - start;
+ struct bpf_insn *insn = &insns[idx];
+ struct bpf_iarray *succ;
+
+ if (!arg_is_visited(&at_in[i][0]) && !arg_is_visited(&at_in[i][1]))
+ continue;
+
+ memcpy(at_out, at_in[i], sizeof(at_out));
+ memcpy(at_stack_out, at_stack_in[i], MAX_ARG_SPILL_SLOTS * sizeof(*at_stack_out));
+
+ arg_track_xfer(env, insn, idx, at_out, at_stack_out, instance, callsites);
+ arg_track_log(env, insn, idx, at_in[i], at_stack_in[i], at_out, at_stack_out);
+
+ /* Propagate to successors within this subprogram */
+ succ = bpf_insn_successors(env, idx);
+ for (int s = 0; s < succ->cnt; s++) {
+ int target = succ->items[s];
+ int ti;
+
+ /* Filter: stay within the subprogram's range */
+ if (target < start || target >= end)
+ continue;
+ ti = target - start;
+
+ for (r = 0; r < MAX_BPF_REG; r++)
+ changed |= arg_track_join(env, idx, target, r,
+ &at_in[ti][r], at_out[r]);
+
+ for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++)
+ changed |= arg_track_join(env, idx, target, -r - 1,
+ &at_stack_in[ti][r], at_stack_out[r]);
+ }
+ }
+ if (changed)
+ goto redo;
+
+ /* Record memory accesses using converged at_in (RPO skips dead code) */
+ for (p = po_end - 1; p >= po_start; p--) {
+ int idx = env->cfg.insn_postorder[p];
+ int i = idx - start;
+ struct bpf_insn *insn = &insns[idx];
+
+ err = record_load_store_access(env, instance, at_in[i], idx);
+ if (err)
+ goto err_free;
+
+ if (insn->code == (BPF_JMP | BPF_CALL)) {
+ err = record_call_access(env, instance, at_in[i], idx);
+ if (err)
+ goto err_free;
+ }
+
+ if (bpf_pseudo_call(insn) || bpf_calls_callback(env, idx)) {
+ kvfree(env->callsite_at_stack[idx]);
+ env->callsite_at_stack[idx] =
+ kvmalloc_objs(*env->callsite_at_stack[idx],
+ MAX_ARG_SPILL_SLOTS, GFP_KERNEL_ACCOUNT);
+ if (!env->callsite_at_stack[idx]) {
+ err = -ENOMEM;
+ goto err_free;
+ }
+ memcpy(env->callsite_at_stack[idx],
+ at_stack_in[i], sizeof(struct arg_track) * MAX_ARG_SPILL_SLOTS);
+ }
+ }
+
+ info->at_in = at_in;
+ at_in = NULL;
+ info->len = len;
+ print_subprog_arg_access(env, subprog, info, at_stack_in);
+ err = 0;
+
+err_free:
+ kvfree(at_stack_out);
+ kvfree(at_stack_in);
+ kvfree(at_in);
+ return err;
+}
+
+/* Return true if any of R1-R5 is derived from a frame pointer. */
+static bool has_fp_args(struct arg_track *args)
+{
+ for (int r = BPF_REG_1; r <= BPF_REG_5; r++)
+ if (args[r].frame != ARG_NONE)
+ return true;
+ return false;
+}
+
+/*
+ * Merge a freshly analyzed instance into the original.
+ * may_read: union (any pass might read the slot).
+ * must_write: intersection (only slots written on ALL passes are guaranteed).
+ * live_before is recomputed by a subsequent update_instance() on @dst.
+ */
+static void merge_instances(struct func_instance *dst, struct func_instance *src)
+{
+ int f, i;
+
+ for (f = 0; f <= dst->depth; f++) {
+ if (!src->frames[f]) {
+ /* This pass didn't touch frame f — must_write intersects with empty. */
+ if (dst->frames[f])
+ for (i = 0; i < dst->insn_cnt; i++)
+ dst->frames[f][i].must_write = SPIS_ZERO;
+ continue;
+ }
+ if (!dst->frames[f]) {
+ /* Previous pass didn't touch frame f — take src, zero must_write. */
+ dst->frames[f] = src->frames[f];
+ src->frames[f] = NULL;
+ for (i = 0; i < dst->insn_cnt; i++)
+ dst->frames[f][i].must_write = SPIS_ZERO;
+ continue;
+ }
+ for (i = 0; i < dst->insn_cnt; i++) {
+ dst->frames[f][i].may_read =
+ spis_or(dst->frames[f][i].may_read,
+ src->frames[f][i].may_read);
+ dst->frames[f][i].must_write =
+ spis_and(dst->frames[f][i].must_write,
+ src->frames[f][i].must_write);
+ }
+ }
+}
+
+static struct func_instance *fresh_instance(struct func_instance *src)
+{
+ struct func_instance *f;
+
+ f = kvzalloc_obj(*f, GFP_KERNEL_ACCOUNT);
+ if (!f)
+ return ERR_PTR(-ENOMEM);
+ f->callsite = src->callsite;
+ f->depth = src->depth;
+ f->subprog = src->subprog;
+ f->subprog_start = src->subprog_start;
+ f->insn_cnt = src->insn_cnt;
+ return f;
+}
+
+static void free_instance(struct func_instance *instance)
+{
+ int i;
+
+ for (i = 0; i <= instance->depth; i++)
+ kvfree(instance->frames[i]);
+ kvfree(instance);
+}
+
+/*
+ * Recursively analyze a subprog with specific 'entry_args'.
+ * Each callee is analyzed with the exact args from its call site.
+ *
+ * Args are recomputed for each call because the dataflow result at_in[]
+ * depends on the entry args and frame depth. Consider: A->C->D and B->C->D
+ * Callsites in A and B pass different args into C, so C is recomputed.
+ * Then within C the same callsite passes different args into D.
+ */
+static int analyze_subprog(struct bpf_verifier_env *env,
+ struct arg_track *entry_args,
+ struct subprog_at_info *info,
+ struct func_instance *instance,
+ u32 *callsites)
+{
+ int subprog = instance->subprog;
+ int depth = instance->depth;
+ struct bpf_insn *insns = env->prog->insnsi;
+ int start = env->subprog_info[subprog].start;
+ int po_start = env->subprog_info[subprog].postorder_start;
+ int po_end = env->subprog_info[subprog + 1].postorder_start;
+ struct func_instance *prev_instance = NULL;
+ int j, err;
+
+ if (++env->liveness->subprog_calls > 10000) {
+ verbose(env, "liveness analysis exceeded complexity limit (%d calls)\n",
+ env->liveness->subprog_calls);
+ return -E2BIG;
+ }
+
+ if (need_resched())
+ cond_resched();
+
+
+ /*
+ * When an instance is reused (must_write_initialized == true),
+ * record into a fresh instance and merge afterward. This avoids
+ * stale must_write marks for instructions not reached in this pass.
+ */
+ if (instance->must_write_initialized) {
+ struct func_instance *fresh = fresh_instance(instance);
+
+ if (IS_ERR(fresh))
+ return PTR_ERR(fresh);
+ prev_instance = instance;
+ instance = fresh;
+ }
+
+ /* Free prior analysis if this subprog was already visited */
+ kvfree(info[subprog].at_in);
+ info[subprog].at_in = NULL;
+
+ err = compute_subprog_args(env, &info[subprog], entry_args, instance, callsites);
+ if (err)
+ goto out_free;
+
+ /* For each reachable call site in the subprog, recurse into callees */
+ for (int p = po_start; p < po_end; p++) {
+ int idx = env->cfg.insn_postorder[p];
+ struct arg_track callee_args[BPF_REG_5 + 1];
+ struct arg_track none = { .frame = ARG_NONE };
+ struct bpf_insn *insn = &insns[idx];
+ struct func_instance *callee_instance;
+ int callee, target;
+ int caller_reg, cb_callee_reg;
+
+ j = idx - start; /* relative index within this subprog */
+
+ if (bpf_pseudo_call(insn)) {
+ target = idx + insn->imm + 1;
+ callee = bpf_find_subprog(env, target);
+ if (callee < 0)
+ continue;
+
+ /* Build entry args: R1-R5 from at_in at call site */
+ for (int r = BPF_REG_1; r <= BPF_REG_5; r++)
+ callee_args[r] = info[subprog].at_in[j][r];
+ } else if (bpf_calls_callback(env, idx)) {
+ callee = find_callback_subprog(env, insn, idx, &caller_reg, &cb_callee_reg);
+ if (callee == -2) {
+ /*
+ * same bpf_loop() calls two different callbacks and passes
+ * stack pointer to them
+ */
+ if (info[subprog].at_in[j][caller_reg].frame == ARG_NONE)
+ continue;
+ for (int f = 0; f <= depth; f++) {
+ err = mark_stack_read(instance, f, idx, SPIS_ALL);
+ if (err)
+ goto out_free;
+ }
+ continue;
+ }
+ if (callee < 0)
+ continue;
+
+ for (int r = BPF_REG_1; r <= BPF_REG_5; r++)
+ callee_args[r] = none;
+ callee_args[cb_callee_reg] = info[subprog].at_in[j][caller_reg];
+ } else {
+ continue;
+ }
+
+ if (!has_fp_args(callee_args))
+ continue;
+
+ if (depth == MAX_CALL_FRAMES - 1) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ callee_instance = call_instance(env, instance, idx, callee);
+ if (IS_ERR(callee_instance)) {
+ err = PTR_ERR(callee_instance);
+ goto out_free;
+ }
+ callsites[depth] = idx;
+ err = analyze_subprog(env, callee_args, info, callee_instance, callsites);
+ if (err)
+ goto out_free;
+
+ /* Pull callee's entry liveness back to caller's callsite */
+ {
+ u32 callee_start = callee_instance->subprog_start;
+ struct per_frame_masks *entry;
+
+ for (int f = 0; f < callee_instance->depth; f++) {
+ entry = get_frame_masks(callee_instance, f, callee_start);
+ if (!entry)
+ continue;
+ err = mark_stack_read(instance, f, idx, entry->live_before);
+ if (err)
+ goto out_free;
+ }
+ }
+ }
+
+ if (prev_instance) {
+ merge_instances(prev_instance, instance);
+ free_instance(instance);
+ instance = prev_instance;
+ }
+ update_instance(env, instance);
+ return 0;
+
+out_free:
+ if (prev_instance)
+ free_instance(instance);
+ return err;
+}
+
+int bpf_compute_subprog_arg_access(struct bpf_verifier_env *env)
+{
+ u32 callsites[MAX_CALL_FRAMES] = {};
+ int insn_cnt = env->prog->len;
+ struct func_instance *instance;
+ struct subprog_at_info *info;
+ int k, err = 0;
+
+ info = kvzalloc_objs(*info, env->subprog_cnt, GFP_KERNEL_ACCOUNT);
+ if (!info)
+ return -ENOMEM;
+
+ env->callsite_at_stack = kvzalloc_objs(*env->callsite_at_stack, insn_cnt,
+ GFP_KERNEL_ACCOUNT);
+ if (!env->callsite_at_stack) {
+ kvfree(info);
+ return -ENOMEM;
+ }
+
+ instance = call_instance(env, NULL, 0, 0);
+ if (IS_ERR(instance)) {
+ err = PTR_ERR(instance);
+ goto out;
+ }
+ err = analyze_subprog(env, NULL, info, instance, callsites);
+ if (err)
+ goto out;
+
+ /*
+ * Subprogs and callbacks that don't receive FP-derived arguments
+ * cannot access ancestor stack frames, so they were skipped during
+ * the recursive walk above. Async callbacks (timer, workqueue) are
+ * also not reachable from the main program's call graph. Analyze
+ * all unvisited subprogs as independent roots at depth 0.
+ *
+ * Use reverse topological order (callers before callees) so that
+ * each subprog is analyzed before its callees, allowing the
+ * recursive walk inside analyze_subprog() to naturally
+ * reach nested callees that also lack FP-derived args.
+ */
+ for (k = env->subprog_cnt - 1; k >= 0; k--) {
+ int sub = env->subprog_topo_order[k];
+
+ if (info[sub].at_in && !bpf_subprog_is_global(env, sub))
+ continue;
+ instance = call_instance(env, NULL, 0, sub);
+ if (IS_ERR(instance)) {
+ err = PTR_ERR(instance);
+ goto out;
+ }
+ err = analyze_subprog(env, NULL, info, instance, callsites);
+ if (err)
+ goto out;
+ }
+
+ if (env->log.level & BPF_LOG_LEVEL2)
+ err = print_instances(env);
+
+out:
+ for (k = 0; k < insn_cnt; k++)
+ kvfree(env->callsite_at_stack[k]);
+ kvfree(env->callsite_at_stack);
+ env->callsite_at_stack = NULL;
+ for (k = 0; k < env->subprog_cnt; k++)
+ kvfree(info[k].at_in);
+ kvfree(info);
+ return err;
+}
+
+/* Each field is a register bitmask */
+struct insn_live_regs {
+ u16 use; /* registers read by instruction */
+ u16 def; /* registers written by instruction */
+ u16 in; /* registers that may be alive before instruction */
+ u16 out; /* registers that may be alive after instruction */
+};
+
+/* Bitmask with 1s for all caller saved registers */
+#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1)
+
+/* Compute info->{use,def} fields for the instruction */
+static void compute_insn_live_regs(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct insn_live_regs *info)
+{
+ struct bpf_call_summary cs;
+ u8 class = BPF_CLASS(insn->code);
+ u8 code = BPF_OP(insn->code);
+ u8 mode = BPF_MODE(insn->code);
+ u16 src = BIT(insn->src_reg);
+ u16 dst = BIT(insn->dst_reg);
+ u16 r0 = BIT(0);
+ u16 def = 0;
+ u16 use = 0xffff;
+
+ switch (class) {
+ case BPF_LD:
+ switch (mode) {
+ case BPF_IMM:
+ if (BPF_SIZE(insn->code) == BPF_DW) {
+ def = dst;
+ use = 0;
+ }
+ break;
+ case BPF_LD | BPF_ABS:
+ case BPF_LD | BPF_IND:
+ /* stick with defaults */
+ break;
+ }
+ break;
+ case BPF_LDX:
+ switch (mode) {
+ case BPF_MEM:
+ case BPF_MEMSX:
+ def = dst;
+ use = src;
+ break;
+ }
+ break;
+ case BPF_ST:
+ switch (mode) {
+ case BPF_MEM:
+ def = 0;
+ use = dst;
+ break;
+ }
+ break;
+ case BPF_STX:
+ switch (mode) {
+ case BPF_MEM:
+ def = 0;
+ use = dst | src;
+ break;
+ case BPF_ATOMIC:
+ switch (insn->imm) {
+ case BPF_CMPXCHG:
+ use = r0 | dst | src;
+ def = r0;
+ break;
+ case BPF_LOAD_ACQ:
+ def = dst;
+ use = src;
+ break;
+ case BPF_STORE_REL:
+ def = 0;
+ use = dst | src;
+ break;
+ default:
+ use = dst | src;
+ if (insn->imm & BPF_FETCH)
+ def = src;
+ else
+ def = 0;
+ }
+ break;
+ }
+ break;
+ case BPF_ALU:
+ case BPF_ALU64:
+ switch (code) {
+ case BPF_END:
+ use = dst;
+ def = dst;
+ break;
+ case BPF_MOV:
+ def = dst;
+ if (BPF_SRC(insn->code) == BPF_K)
+ use = 0;
+ else
+ use = src;
+ break;
+ default:
+ def = dst;
+ if (BPF_SRC(insn->code) == BPF_K)
+ use = dst;
+ else
+ use = dst | src;
+ }
+ break;
+ case BPF_JMP:
+ case BPF_JMP32:
+ switch (code) {
+ case BPF_JA:
+ def = 0;
+ if (BPF_SRC(insn->code) == BPF_X)
+ use = dst;
+ else
+ use = 0;
+ break;
+ case BPF_JCOND:
+ def = 0;
+ use = 0;
+ break;
+ case BPF_EXIT:
+ def = 0;
+ use = r0;
+ break;
+ case BPF_CALL:
+ def = ALL_CALLER_SAVED_REGS;
+ use = def & ~BIT(BPF_REG_0);
+ if (bpf_get_call_summary(env, insn, &cs))
+ use = GENMASK(cs.num_params, 1);
+ break;
+ default:
+ def = 0;
+ if (BPF_SRC(insn->code) == BPF_K)
+ use = dst;
+ else
+ use = dst | src;
+ }
+ break;
+ }
+
+ info->def = def;
+ info->use = use;
+}
+
+/* Compute may-live registers after each instruction in the program.
+ * The register is live after the instruction I if it is read by some
+ * instruction S following I during program execution and is not
+ * overwritten between I and S.
+ *
+ * Store result in env->insn_aux_data[i].live_regs.
+ */
+int bpf_compute_live_registers(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *insn_aux = env->insn_aux_data;
+ struct bpf_insn *insns = env->prog->insnsi;
+ struct insn_live_regs *state;
+ int insn_cnt = env->prog->len;
+ int err = 0, i, j;
+ bool changed;
+
+ /* Use the following algorithm:
+ * - define the following:
+ * - I.use : a set of all registers read by instruction I;
+ * - I.def : a set of all registers written by instruction I;
+ * - I.in : a set of all registers that may be alive before I execution;
+ * - I.out : a set of all registers that may be alive after I execution;
+ * - insn_successors(I): a set of instructions S that might immediately
+ * follow I for some program execution;
+ * - associate separate empty sets 'I.in' and 'I.out' with each instruction;
+ * - visit each instruction in a postorder and update
+ * state[i].in, state[i].out as follows:
+ *
+ * state[i].out = U [state[s].in for S in insn_successors(i)]
+ * state[i].in = (state[i].out / state[i].def) U state[i].use
+ *
+ * (where U stands for set union, / stands for set difference)
+ * - repeat the computation while {in,out} fields changes for
+ * any instruction.
+ */
+ state = kvzalloc_objs(*state, insn_cnt, GFP_KERNEL_ACCOUNT);
+ if (!state) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < insn_cnt; ++i)
+ compute_insn_live_regs(env, &insns[i], &state[i]);
+
+ /* Forward pass: resolve stack access through FP-derived pointers */
+ err = bpf_compute_subprog_arg_access(env);
+ if (err)
+ goto out;
+
+ changed = true;
+ while (changed) {
+ changed = false;
+ for (i = 0; i < env->cfg.cur_postorder; ++i) {
+ int insn_idx = env->cfg.insn_postorder[i];
+ struct insn_live_regs *live = &state[insn_idx];
+ struct bpf_iarray *succ;
+ u16 new_out = 0;
+ u16 new_in = 0;
+
+ succ = bpf_insn_successors(env, insn_idx);
+ for (int s = 0; s < succ->cnt; ++s)
+ new_out |= state[succ->items[s]].in;
+ new_in = (new_out & ~live->def) | live->use;
+ if (new_out != live->out || new_in != live->in) {
+ live->in = new_in;
+ live->out = new_out;
+ changed = true;
+ }
+ }
+ }
+
+ for (i = 0; i < insn_cnt; ++i)
+ insn_aux[i].live_regs_before = state[i].in;
+
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "Live regs before insn:\n");
+ for (i = 0; i < insn_cnt; ++i) {
+ if (env->insn_aux_data[i].scc)
+ verbose(env, "%3d ", env->insn_aux_data[i].scc);
+ else
+ verbose(env, " ");
+ verbose(env, "%3d: ", i);
+ for (j = BPF_REG_0; j < BPF_REG_10; ++j)
+ if (insn_aux[i].live_regs_before & BIT(j))
+ verbose(env, "%d", j);
+ else
+ verbose(env, ".");
+ verbose(env, " ");
+ bpf_verbose_insn(env, &insns[i]);
+ if (bpf_is_ldimm64(&insns[i]))
+ i++;
+ }
+ }
+
+out:
+ kvfree(state);
+ return err;
+}
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 1ccbf28b2ad9..23267213a17f 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -270,7 +270,7 @@ static int cgroup_storage_get_next_key(struct bpf_map *_map, void *key,
goto enoent;
storage = list_next_entry(storage, list_map);
- if (!storage)
+ if (list_entry_is_head(storage, &map->list, list_map))
goto enoent;
} else {
storage = list_first_entry(&map->list,
@@ -364,7 +364,7 @@ static long cgroup_storage_delete_elem(struct bpf_map *map, void *key)
return -EINVAL;
}
-static int cgroup_storage_check_btf(const struct bpf_map *map,
+static int cgroup_storage_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index a0c3b35de2ce..011e4ec25acd 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -329,47 +329,6 @@ __printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
}
EXPORT_SYMBOL_GPL(bpf_log);
-static const struct bpf_line_info *
-find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
-{
- const struct bpf_line_info *linfo;
- const struct bpf_prog *prog;
- u32 nr_linfo;
- int l, r, m;
-
- prog = env->prog;
- nr_linfo = prog->aux->nr_linfo;
-
- if (!nr_linfo || insn_off >= prog->len)
- return NULL;
-
- linfo = prog->aux->linfo;
- /* Loop invariant: linfo[l].insn_off <= insns_off.
- * linfo[0].insn_off == 0 which always satisfies above condition.
- * Binary search is searching for rightmost linfo entry that satisfies
- * the above invariant, giving us the desired record that covers given
- * instruction offset.
- */
- l = 0;
- r = nr_linfo - 1;
- while (l < r) {
- /* (r - l + 1) / 2 means we break a tie to the right, so if:
- * l=1, r=2, linfo[l].insn_off <= insn_off, linfo[r].insn_off > insn_off,
- * then m=2, we see that linfo[m].insn_off > insn_off, and so
- * r becomes 1 and we exit the loop with correct l==1.
- * If the tie was broken to the left, m=1 would end us up in
- * an endless loop where l and m stay at 1 and r stays at 2.
- */
- m = l + (r - l + 1) / 2;
- if (linfo[m].insn_off <= insn_off)
- l = m;
- else
- r = m - 1;
- }
-
- return &linfo[l];
-}
-
static const char *ltrim(const char *s)
{
while (isspace(*s))
@@ -390,7 +349,7 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
return;
prev_linfo = env->prev_linfo;
- linfo = find_linfo(env, insn_off);
+ linfo = bpf_find_linfo(env->prog, insn_off);
if (!linfo || linfo == prev_linfo)
return;
@@ -542,7 +501,8 @@ static char slot_type_char[] = {
[STACK_ZERO] = '0',
[STACK_DYNPTR] = 'd',
[STACK_ITER] = 'i',
- [STACK_IRQ_FLAG] = 'f'
+ [STACK_IRQ_FLAG] = 'f',
+ [STACK_POISON] = 'p',
};
#define UNUM_MAX_DECIMAL U16_MAX
@@ -581,6 +541,8 @@ int tnum_strn(char *str, size_t size, struct tnum a)
if (a.mask == 0) {
if (is_unum_decimal(a.value))
return snprintf(str, size, "%llu", a.value);
+ if (is_snum_decimal(a.value))
+ return snprintf(str, size, "%lld", a.value);
else
return snprintf(str, size, "%#llx", a.value);
}
@@ -692,7 +654,7 @@ static void print_reg_state(struct bpf_verifier_env *env,
if (state->frameno != reg->frameno)
verbose(env, "[%d]", reg->frameno);
if (tnum_is_const(reg->var_off)) {
- verbose_snum(env, reg->var_off.value + reg->off);
+ verbose_snum(env, reg->var_off.value + reg->delta);
return;
}
}
@@ -702,7 +664,7 @@ static void print_reg_state(struct bpf_verifier_env *env,
if (reg->id)
verbose_a("id=%d", reg->id & ~BPF_ADD_CONST);
if (reg->id & BPF_ADD_CONST)
- verbose(env, "%+d", reg->off);
+ verbose(env, "%+d", reg->delta);
if (reg->ref_obj_id)
verbose_a("ref_obj_id=%d", reg->ref_obj_id);
if (type_is_non_owning_ref(reg->type))
@@ -714,9 +676,9 @@ static void print_reg_state(struct bpf_verifier_env *env,
reg->map_ptr->key_size,
reg->map_ptr->value_size);
}
- if (t != SCALAR_VALUE && reg->off) {
+ if (t != SCALAR_VALUE && reg->delta) {
verbose_a("off=");
- verbose_snum(env, reg->off);
+ verbose_snum(env, reg->delta);
}
if (type_is_pkt_pointer(t)) {
verbose_a("r=");
@@ -777,7 +739,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
for (j = 0; j < BPF_REG_SIZE; j++) {
slot_type = state->stack[i].slot_type[j];
- if (slot_type != STACK_INVALID)
+ if (slot_type != STACK_INVALID && slot_type != STACK_POISON)
valid = true;
types_buf[j] = slot_type_char[slot_type];
}
@@ -845,7 +807,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
mark_verifier_state_clean(env);
}
-static inline u32 vlog_alignment(u32 pos)
+u32 bpf_vlog_alignment(u32 pos)
{
return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT),
BPF_LOG_MIN_ALIGNMENT) - pos - 1;
@@ -857,7 +819,7 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st
if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {
/* remove new line character */
bpf_vlog_reset(&env->log, env->prev_log_pos - 1);
- verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' ');
+ verbose(env, "%*c;", bpf_vlog_alignment(env->prev_insn_print_pos), ' ');
} else {
verbose(env, "%d:", env->insn_idx);
}
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 1adeb4d3b8cf..0f57608b385d 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -751,7 +751,7 @@ free_stack:
return err;
}
-static int trie_check_btf(const struct bpf_map *map,
+static int trie_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index bd45dda9dc35..e9662db7198f 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -102,6 +102,8 @@ struct bpf_mem_cache {
int percpu_size;
bool draining;
struct bpf_mem_cache *tgt;
+ void (*dtor)(void *obj, void *ctx);
+ void *dtor_ctx;
/* list of objects to be freed after RCU GP */
struct llist_head free_by_rcu;
@@ -260,12 +262,14 @@ static void free_one(void *obj, bool percpu)
kfree(obj);
}
-static int free_all(struct llist_node *llnode, bool percpu)
+static int free_all(struct bpf_mem_cache *c, struct llist_node *llnode, bool percpu)
{
struct llist_node *pos, *t;
int cnt = 0;
llist_for_each_safe(pos, t, llnode) {
+ if (c->dtor)
+ c->dtor((void *)pos + LLIST_NODE_SZ, c->dtor_ctx);
free_one(pos, percpu);
cnt++;
}
@@ -276,21 +280,10 @@ static void __free_rcu(struct rcu_head *head)
{
struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);
- free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
+ free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
atomic_set(&c->call_rcu_ttrace_in_progress, 0);
}
-static void __free_rcu_tasks_trace(struct rcu_head *head)
-{
- /* If RCU Tasks Trace grace period implies RCU grace period,
- * there is no need to invoke call_rcu().
- */
- if (rcu_trace_implies_rcu_gp())
- __free_rcu(head);
- else
- call_rcu(head, __free_rcu);
-}
-
static void enque_to_free(struct bpf_mem_cache *c, void *obj)
{
struct llist_node *llnode = obj;
@@ -308,7 +301,7 @@ static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
if (unlikely(READ_ONCE(c->draining))) {
llnode = llist_del_all(&c->free_by_rcu_ttrace);
- free_all(llnode, !!c->percpu_size);
+ free_all(c, llnode, !!c->percpu_size);
}
return;
}
@@ -322,12 +315,12 @@ static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
return;
}
- /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
- * If RCU Tasks Trace grace period implies RCU grace period, free
- * these elements directly, else use call_rcu() to wait for normal
- * progs to finish and finally do free_one() on each element.
+ /*
+ * Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
+ * RCU Tasks Trace grace period implies RCU grace period, so pass
+ * __free_rcu directly as the callback.
*/
- call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace);
+ call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu);
}
static void free_bulk(struct bpf_mem_cache *c)
@@ -417,7 +410,7 @@ static void check_free_by_rcu(struct bpf_mem_cache *c)
dec_active(c, &flags);
if (unlikely(READ_ONCE(c->draining))) {
- free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
+ free_all(c, llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
atomic_set(&c->call_rcu_in_progress, 0);
} else {
call_rcu_hurry(&c->rcu, __free_by_rcu);
@@ -635,13 +628,13 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
* Except for waiting_for_gp_ttrace list, there are no concurrent operations
* on these lists, so it is safe to use __llist_del_all().
*/
- free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
- free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
- free_all(__llist_del_all(&c->free_llist), percpu);
- free_all(__llist_del_all(&c->free_llist_extra), percpu);
- free_all(__llist_del_all(&c->free_by_rcu), percpu);
- free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
- free_all(llist_del_all(&c->waiting_for_gp), percpu);
+ free_all(c, llist_del_all(&c->free_by_rcu_ttrace), percpu);
+ free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), percpu);
+ free_all(c, __llist_del_all(&c->free_llist), percpu);
+ free_all(c, __llist_del_all(&c->free_llist_extra), percpu);
+ free_all(c, __llist_del_all(&c->free_by_rcu), percpu);
+ free_all(c, __llist_del_all(&c->free_llist_extra_rcu), percpu);
+ free_all(c, llist_del_all(&c->waiting_for_gp), percpu);
}
static void check_mem_cache(struct bpf_mem_cache *c)
@@ -680,6 +673,9 @@ static void check_leaked_objs(struct bpf_mem_alloc *ma)
static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
{
+ /* We can free dtor ctx only once all callbacks are done using it. */
+ if (ma->dtor_ctx_free)
+ ma->dtor_ctx_free(ma->dtor_ctx);
check_leaked_objs(ma);
free_percpu(ma->cache);
free_percpu(ma->caches);
@@ -689,20 +685,18 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
- /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
+ /*
+ * waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
* might still execute. Wait for them.
*
* rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
* but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
- * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
- * so if call_rcu(head, __free_rcu) is skipped due to
- * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
- * using rcu_trace_implies_rcu_gp() as well.
+ * to wait for the pending __free_by_rcu(), and __free_rcu(). RCU Tasks
+ * Trace grace period implies RCU grace period, so all __free_rcu don't
+ * need extra call_rcu() (and thus extra rcu_barrier() here).
*/
rcu_barrier(); /* wait for __free_by_rcu */
rcu_barrier_tasks_trace(); /* wait for __free_rcu */
- if (!rcu_trace_implies_rcu_gp())
- rcu_barrier();
free_mem_alloc_no_barrier(ma);
}
@@ -1014,3 +1008,32 @@ int bpf_mem_alloc_check_size(bool percpu, size_t size)
return 0;
}
+
+/* Install @dtor, invoked with @ctx for every object released by @ma
+ * (see free_all()). @dtor_ctx_free releases @ctx when the allocator is
+ * torn down in free_mem_alloc_no_barrier().
+ * NOTE(review): fields are written without synchronization -- presumably
+ * callers invoke this before @ma is used concurrently; confirm at call sites.
+ */
+void bpf_mem_alloc_set_dtor(struct bpf_mem_alloc *ma, void (*dtor)(void *obj, void *ctx),
+			    void (*dtor_ctx_free)(void *ctx), void *ctx)
+{
+	struct bpf_mem_caches *cc;
+	struct bpf_mem_cache *c;
+	int cpu, i;
+
+	ma->dtor_ctx_free = dtor_ctx_free;
+	ma->dtor_ctx = ctx;
+
+	/* Single per-CPU cache variant. */
+	if (ma->cache) {
+		for_each_possible_cpu(cpu) {
+			c = per_cpu_ptr(ma->cache, cpu);
+			c->dtor = dtor;
+			c->dtor_ctx = ctx;
+		}
+	}
+	/* Per-CPU size-class variant: NUM_CACHES caches per CPU. */
+	if (ma->caches) {
+		for_each_possible_cpu(cpu) {
+			cc = per_cpu_ptr(ma->caches, cpu);
+			for (i = 0; i < NUM_CACHES; i++) {
+				c = &cc->cache[i];
+				c->dtor = dtor;
+				c->dtor_ctx = ctx;
+			}
+		}
+	}
+}
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 0ad97d643bf4..0d6f5569588c 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -435,9 +435,8 @@ static struct ns_common *bpf_prog_offload_info_fill_ns(void *private_data)
if (aux->offload) {
args->info->ifindex = aux->offload->netdev->ifindex;
- net = dev_net(aux->offload->netdev);
- get_net(net);
- ns = &net->ns;
+ net = maybe_get_net(dev_net(aux->offload->netdev));
+ ns = net ? &net->ns : NULL;
} else {
args->info->ifindex = 0;
ns = NULL;
@@ -647,9 +646,8 @@ static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data)
if (args->offmap->netdev) {
args->info->ifindex = args->offmap->netdev->ifindex;
- net = dev_net(args->offmap->netdev);
- get_net(net);
- ns = &net->ns;
+ net = maybe_get_net(dev_net(args->offmap->netdev));
+ ns = net ? &net->ns : NULL;
} else {
args->info->ifindex = 0;
ns = NULL;
diff --git a/kernel/bpf/states.c b/kernel/bpf/states.c
new file mode 100644
index 000000000000..8478d2c6ed5b
--- /dev/null
+++ b/kernel/bpf/states.c
@@ -0,0 +1,1563 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/filter.h>
+
+#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
+
+#define BPF_COMPLEXITY_LIMIT_STATES 64
+
+/* True if the program instruction at @insn_idx is a may_goto instruction. */
+static bool is_may_goto_insn_at(struct bpf_verifier_env *env, int insn_idx)
+{
+	struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
+
+	return bpf_is_may_goto_insn(insn);
+}
+
+/* True if instruction @insn_idx was flagged as an iterator 'next' call. */
+static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx)
+{
+	struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
+
+	return aux->is_iter_next;
+}
+
+/* Track the high watermark of verifier states kept in memory. */
+static void update_peak_states(struct bpf_verifier_env *env)
+{
+	u32 total;
+
+	total = env->explored_states_size + env->free_list_size + env->num_backedges;
+	if (total > env->peak_states)
+		env->peak_states = total;
+}
+
+/* struct bpf_verifier_state->parent refers to states that live in either
+ * env->explored_states or env->free_list. In both cases the state is
+ * embedded in a struct bpf_verifier_state_list.
+ */
+static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_state *st)
+{
+	struct bpf_verifier_state *parent = st->parent;
+
+	if (!parent)
+		return NULL;
+	return container_of(parent, struct bpf_verifier_state_list, state);
+}
+
+static bool incomplete_read_marks(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st);
+
+/* A state can be freed only when it is no longer referenced:
+ * - it is on env->free_list;
+ * - it has no children states;
+ * - its read marks are complete.
+ */
+static void maybe_free_verifier_state(struct bpf_verifier_env *env,
+				      struct bpf_verifier_state_list *sl)
+{
+	if (!sl->in_free_list)
+		return;
+	if (sl->state.branches != 0)
+		return;
+	if (incomplete_read_marks(env, &sl->state))
+		return;
+	list_del(&sl->node);
+	bpf_free_verifier_state(&sl->state, false);
+	kfree(sl);
+	env->free_list_size--;
+}
+
+/* For state @st look for a topmost frame with frame_insn_idx() in some SCC,
+ * if such frame exists form a corresponding @callchain as an array of
+ * call sites leading to this frame and SCC id.
+ * E.g.:
+ *
+ * void foo() { A: loop {... SCC#1 ...}; }
+ * void bar() { B: loop { C: foo(); ... SCC#2 ... }
+ * D: loop { E: foo(); ... SCC#3 ... } }
+ * void main() { F: bar(); }
+ *
+ * @callchain at (A) would be either (F,SCC#2) or (F,SCC#3) depending
+ * on @st frame call sites being (F,C,A) or (F,E,A).
+ *
+ * Returns true and fills @callchain if some frame is inside an SCC,
+ * returns false otherwise (@callchain is left zeroed in that case).
+ */
+static bool compute_scc_callchain(struct bpf_verifier_env *env,
+				  struct bpf_verifier_state *st,
+				  struct bpf_scc_callchain *callchain)
+{
+	u32 i, scc, insn_idx;
+
+	memset(callchain, 0, sizeof(*callchain));
+	for (i = 0; i <= st->curframe; i++) {
+		insn_idx = bpf_frame_insn_idx(st, i);
+		scc = env->insn_aux_data[insn_idx].scc;
+		if (scc) {
+			/* Topmost frame inside an SCC found: record it and stop. */
+			callchain->scc = scc;
+			break;
+		} else if (i < st->curframe) {
+			/* Frame not in an SCC: remember its call site and descend. */
+			callchain->callsites[i] = insn_idx;
+		} else {
+			/* Innermost frame reached without hitting any SCC. */
+			return false;
+		}
+	}
+	return true;
+}
+
+/* Check if bpf_scc_visit instance for @callchain exists.
+ * Returns the matching instance or NULL if none was allocated yet.
+ */
+static struct bpf_scc_visit *scc_visit_lookup(struct bpf_verifier_env *env,
+					      struct bpf_scc_callchain *callchain)
+{
+	struct bpf_scc_info *info = env->scc_info[callchain->scc];
+	struct bpf_scc_visit *visits;
+	u32 i;
+
+	/* env->scc_info[] entries are allocated lazily by scc_visit_alloc(),
+	 * so @info may be NULL; don't touch info->visits before this check.
+	 */
+	if (!info)
+		return NULL;
+	visits = info->visits;
+	for (i = 0; i < info->num_visits; i++)
+		if (memcmp(callchain, &visits[i].callchain, sizeof(*callchain)) == 0)
+			return &visits[i];
+	return NULL;
+}
+
+/* Allocate a new bpf_scc_visit instance corresponding to @callchain.
+ * Allocated instances are alive for a duration of the do_check_common()
+ * call and are freed by free_states().
+ * Returns NULL on allocation failure; in that case the previously
+ * allocated visits remain reachable via env->scc_info[scc].
+ */
+static struct bpf_scc_visit *scc_visit_alloc(struct bpf_verifier_env *env,
+					     struct bpf_scc_callchain *callchain)
+{
+	struct bpf_scc_visit *visit;
+	struct bpf_scc_info *info;
+	u32 scc, num_visits;
+	u64 new_sz;
+
+	scc = callchain->scc;
+	info = env->scc_info[scc];
+	num_visits = info ? info->num_visits : 0;
+	/* Grow the trailing visits[] array by one entry. */
+	new_sz = sizeof(*info) + sizeof(struct bpf_scc_visit) * (num_visits + 1);
+	info = kvrealloc(env->scc_info[scc], new_sz, GFP_KERNEL_ACCOUNT);
+	if (!info)
+		return NULL;
+	env->scc_info[scc] = info;
+	info->num_visits = num_visits + 1;
+	visit = &info->visits[num_visits];
+	/* Fresh visit: entry_state and backedges start out empty. */
+	memset(visit, 0, sizeof(*visit));
+	memcpy(&visit->callchain, callchain, sizeof(*callchain));
+	return visit;
+}
+
+/* Form a string '(callsite#1,callsite#2,...,scc)' in env->tmp_str_buf.
+ * Returns env->tmp_str_buf; the buffer is reused by subsequent calls.
+ */
+static char *format_callchain(struct bpf_verifier_env *env, struct bpf_scc_callchain *callchain)
+{
+	char *buf = env->tmp_str_buf;
+	int i, delta = 0;
+
+	delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "(");
+	for (i = 0; i < ARRAY_SIZE(callchain->callsites); i++) {
+		/* A zero callsite terminates the chain (callchain is zero-filled). */
+		if (!callchain->callsites[i])
+			break;
+		delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u,",
+				  callchain->callsites[i]);
+	}
+	delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u)", callchain->scc);
+	/* NOTE(review): assumes TMP_STR_BUF_LEN comfortably exceeds the
+	 * formatted length, otherwise @delta could exceed the buffer size
+	 * and the size argument would go negative -- confirm the bound.
+	 */
+	return env->tmp_str_buf;
+}
+
+/* If a callchain for @st exists (@st is inside some SCC), make sure a
+ * bpf_scc_visit instance for that callchain exists, and if the visit is
+ * empty record @st as its entry state.
+ * Returns 0 on success, -ENOMEM if a visit could not be allocated.
+ */
+static int maybe_enter_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+	struct bpf_scc_callchain *callchain = &env->callchain_buf;
+	struct bpf_scc_visit *visit;
+
+	if (!compute_scc_callchain(env, st, callchain))
+		return 0;
+	visit = scc_visit_lookup(env, callchain);
+	if (!visit)
+		visit = scc_visit_alloc(env, callchain);
+	if (!visit)
+		return -ENOMEM;
+	if (visit->entry_state)
+		return 0;
+	visit->entry_state = st;
+	if (env->log.level & BPF_LOG_LEVEL2)
+		verbose(env, "SCC enter %s\n", format_callchain(env, callchain));
+	return 0;
+}
+
+static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit);
+
+/* If callchain for @st exists (@st is in some SCC), make it empty:
+ * - set visit->entry_state to NULL;
+ * - flush accumulated backedges.
+ * Returns 0 on success or a negative error code.
+ */
+static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+	struct bpf_scc_callchain *callchain = &env->callchain_buf;
+	struct bpf_scc_visit *visit;
+
+	if (!compute_scc_callchain(env, st, callchain))
+		return 0;
+	visit = scc_visit_lookup(env, callchain);
+	if (!visit) {
+		/*
+		 * If path traversal stops inside an SCC, corresponding bpf_scc_visit
+		 * must exist for non-speculative paths. For non-speculative paths
+		 * traversal stops when:
+		 * a. Verification error is found, maybe_exit_scc() is not called.
+		 * b. Top level BPF_EXIT is reached. Top level BPF_EXIT is not a member
+		 * of any SCC.
+		 * c. A checkpoint is reached and matched. Checkpoints are created by
+		 * is_state_visited(), which calls maybe_enter_scc(), which allocates
+		 * bpf_scc_visit instances for checkpoints within SCCs.
+		 * (c) is the only case that can reach this point.
+		 */
+		if (!st->speculative) {
+			verifier_bug(env, "scc exit: no visit info for call chain %s",
+				     format_callchain(env, callchain));
+			return -EFAULT;
+		}
+		return 0;
+	}
+	/* Only the state recorded as the SCC entry may close the visit. */
+	if (visit->entry_state != st)
+		return 0;
+	if (env->log.level & BPF_LOG_LEVEL2)
+		verbose(env, "SCC exit %s\n", format_callchain(env, callchain));
+	visit->entry_state = NULL;
+	/* SCC fully explored: drop backedge accounting and flush the
+	 * accumulated backedges (see propagate_backedges()).
+	 */
+	env->num_backedges -= visit->num_backedges;
+	visit->num_backedges = 0;
+	update_peak_states(env);
+	return propagate_backedges(env, visit);
+}
+
+/* Lookup a bpf_scc_visit instance corresponding to @st callchain
+ * and add @backedge to visit->backedges. @st callchain must exist,
+ * otherwise -EFAULT is returned (verifier bug).
+ */
+static int add_scc_backedge(struct bpf_verifier_env *env,
+			    struct bpf_verifier_state *st,
+			    struct bpf_scc_backedge *backedge)
+{
+	struct bpf_scc_callchain *callchain = &env->callchain_buf;
+	struct bpf_scc_visit *visit;
+
+	if (!compute_scc_callchain(env, st, callchain)) {
+		verifier_bug(env, "add backedge: no SCC in verification path, insn_idx %d",
+			     st->insn_idx);
+		return -EFAULT;
+	}
+	visit = scc_visit_lookup(env, callchain);
+	if (!visit) {
+		verifier_bug(env, "add backedge: no visit info for call chain %s",
+			     format_callchain(env, callchain));
+		return -EFAULT;
+	}
+	if (env->log.level & BPF_LOG_LEVEL2)
+		verbose(env, "SCC backedge %s\n", format_callchain(env, callchain));
+	/* Push onto the visit's singly-linked backedge list and account for it. */
+	backedge->next = visit->backedges;
+	visit->backedges = backedge;
+	visit->num_backedges++;
+	env->num_backedges++;
+	update_peak_states(env);
+	return 0;
+}
+
+/* bpf_reg_state->live marks for registers in a state @st are incomplete
+ * if @st is in some SCC and not all execution paths starting at this
+ * SCC have been fully explored (i.e. its visit still has backedges).
+ */
+static bool incomplete_read_marks(struct bpf_verifier_env *env,
+				  struct bpf_verifier_state *st)
+{
+	struct bpf_scc_callchain *callchain = &env->callchain_buf;
+	struct bpf_scc_visit *visit;
+
+	if (!compute_scc_callchain(env, st, callchain))
+		return false;
+	visit = scc_visit_lookup(env, callchain);
+	return visit && visit->backedges;
+}
+
+/* Walk the parent chain of @st decrementing branch counters; states whose
+ * counter drops to zero exit their SCC and may have their list entry freed.
+ * Returns 0 on success or a negative error from maybe_exit_scc().
+ */
+int bpf_update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+	struct bpf_verifier_state_list *sl = NULL, *parent_sl;
+	struct bpf_verifier_state *parent;
+	int err;
+
+	while (st) {
+		u32 br = --st->branches;
+
+		/* verifier_bug_if(br > 1, ...) technically makes sense here,
+		 * but see comment in push_stack(), hence:
+		 */
+		verifier_bug_if((int)br < 0, env, "%s:branches_to_explore=%d", __func__, br);
+		if (br)
+			break;
+		err = maybe_exit_scc(env, st);
+		if (err)
+			return err;
+		/* @sl, when non-NULL, is the list entry embedding the current
+		 * @st; read @st's parent links before the entry can be freed.
+		 */
+		parent = st->parent;
+		parent_sl = state_parent_as_list(st);
+		if (sl)
+			maybe_free_verifier_state(env, sl);
+		st = parent;
+		sl = parent_sl;
+	}
+	return 0;
+}
+
+/* check %cur's range satisfies %old's */
+static bool range_within(const struct bpf_reg_state *old,
+			 const struct bpf_reg_state *cur)
+{
+	/* 64-bit unsigned and signed bounds of @cur must lie inside @old's... */
+	if (cur->umin_value < old->umin_value || cur->umax_value > old->umax_value)
+		return false;
+	if (cur->smin_value < old->smin_value || cur->smax_value > old->smax_value)
+		return false;
+	/* ...and likewise for the 32-bit sub-register bounds. */
+	if (cur->u32_min_value < old->u32_min_value || cur->u32_max_value > old->u32_max_value)
+		return false;
+	if (cur->s32_min_value < old->s32_min_value || cur->s32_max_value > old->s32_max_value)
+		return false;
+	return true;
+}
+
+/* If in the old state two registers had the same id, then they need to have
+ * the same id in the new state as well. But that id could be different from
+ * the old state, so we need to track the mapping from old to new ids.
+ * Once we have seen that, say, a reg with old id 5 had new id 9, any subsequent
+ * regs with old id 5 must also have new id 9 for the new state to be safe. But
+ * regs with a different old id could still have new id 9, we don't care about
+ * that.
+ * So we look through our idmap to see if this old id has been seen before. If
+ * so, we require the new id to match; otherwise, we add the id pair to the map.
+ */
+static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
+{
+	struct bpf_id_pair *map = idmap->map;
+	unsigned int i;
+
+	/* If either id is zero both must be zero. */
+	if (!old_id || !cur_id)
+		return old_id == cur_id;
+
+	for (i = 0; i < idmap->cnt; i++) {
+		if (map[i].old == old_id)
+			return map[i].cur == cur_id;
+		if (map[i].cur == cur_id)
+			return false;
+	}
+
+	if (idmap->cnt == BPF_ID_MAP_SIZE) {
+		/* We ran out of idmap slots, which should be impossible */
+		WARN_ON_ONCE(1);
+		return false;
+	}
+
+	/* First time this old id is seen: record the pairing. */
+	map[idmap->cnt].old = old_id;
+	map[idmap->cnt].cur = cur_id;
+	idmap->cnt++;
+	return true;
+}
+
+/*
+ * Compare scalar register IDs for state equivalence.
+ *
+ * When old_id == 0, the old register is independent - not linked to any
+ * other register. Any linking in the current state only adds constraints,
+ * making it more restrictive. Since the old state didn't rely on any ID
+ * relationships for this register, it's always safe to accept cur regardless
+ * of its ID. Hence, return true immediately.
+ *
+ * When old_id != 0 but cur_id == 0, we need to ensure that different
+ * independent registers in cur don't incorrectly satisfy the ID matching
+ * requirements of linked registers in old.
+ *
+ * Example: if old has r6.id=X and r7.id=X (linked), but cur has r6.id=0
+ * and r7.id=0 (both independent), without temp IDs both would map old_id=X
+ * to cur_id=0 and pass. With temp IDs: r6 maps X->temp1, r7 tries to map
+ * X->temp2, but X is already mapped to temp1, so the check fails correctly.
+ *
+ * When old_id has BPF_ADD_CONST set, the compound id (base | flag) and the
+ * base id (flag stripped) must both map consistently. Example: old has
+ * r2.id=A, r3.id=A|flag (r3 = r2 + delta), cur has r2.id=B, r3.id=C|flag
+ * (r3 derived from unrelated r4). Without the base check, idmap gets two
+ * independent entries A->B and A|flag->C|flag, missing that A->C conflicts
+ * with A->B. The base ID cross-check catches this.
+ */
+static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
+{
+	if (!old_id)
+		return true;
+
+	/* Assign a fresh temporary id to independent cur registers so two of
+	 * them can never satisfy the same linked old id.
+	 */
+	if (!cur_id)
+		cur_id = ++idmap->tmp_id_gen;
+
+	if (!check_ids(old_id, cur_id, idmap))
+		return false;
+	if (!(old_id & BPF_ADD_CONST))
+		return true;
+	/* Cross-check the base ids with the linkage flag stripped. */
+	return check_ids(old_id & ~BPF_ADD_CONST, cur_id & ~BPF_ADD_CONST, idmap);
+}
+
+/* Clear liveness-dead registers and dead stack slot halves in function
+ * state @st (frame number @frame), so that state comparison does not
+ * distinguish data no future instruction can read.
+ * @live_regs is the bitmask of registers live before the frame's current
+ * instruction (insn_aux_data[..].live_regs_before).
+ */
+static void __clean_func_state(struct bpf_verifier_env *env,
+			       struct bpf_func_state *st,
+			       u16 live_regs, int frame)
+{
+	int i, j;
+
+	for (i = 0; i < BPF_REG_FP; i++) {
+		/* liveness must not touch this register anymore */
+		if (!(live_regs & BIT(i)))
+			/* since the register is unused, clear its state
+			 * to make further comparison simpler
+			 */
+			bpf_mark_reg_not_init(env, &st->regs[i]);
+	}
+
+	/*
+	 * Clean dead 4-byte halves within each SPI independently.
+	 * half_spi 2*i → lower half: slot_type[0..3] (closer to FP)
+	 * half_spi 2*i+1 → upper half: slot_type[4..7] (farther from FP)
+	 */
+	for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) {
+		bool lo_live = bpf_stack_slot_alive(env, frame, i * 2);
+		bool hi_live = bpf_stack_slot_alive(env, frame, i * 2 + 1);
+
+		if (!hi_live || !lo_live) {
+			/* Byte range [start, end) of slot_type[] to poison. */
+			int start = !lo_live ? 0 : BPF_REG_SIZE / 2;
+			int end = !hi_live ? BPF_REG_SIZE : BPF_REG_SIZE / 2;
+			u8 stype = st->stack[i].slot_type[7];
+
+			/*
+			 * Don't clear special slots.
+			 * destroy_if_dynptr_stack_slot() needs STACK_DYNPTR to
+			 * detect overwrites and invalidate associated data slices.
+			 * is_iter_reg_valid_uninit() and is_irq_flag_reg_valid_uninit()
+			 * check for their respective slot types to detect double-create.
+			 */
+			if (stype == STACK_DYNPTR || stype == STACK_ITER ||
+			    stype == STACK_IRQ_FLAG)
+				continue;
+
+			/*
+			 * Only destroy spilled_ptr when hi half is dead.
+			 * If hi half is still live with STACK_SPILL, the
+			 * spilled_ptr metadata is needed for correct state
+			 * comparison in stacksafe().
+			 * is_spilled_reg() is using slot_type[7], but
+			 * is_spilled_scalar_after() check either slot_type[0] or [4]
+			 */
+			if (!hi_live) {
+				struct bpf_reg_state *spill = &st->stack[i].spilled_ptr;
+
+				if (lo_live && stype == STACK_SPILL) {
+					u8 val = STACK_MISC;
+
+					/*
+					 * 8 byte spill of scalar 0 where half slot is dead
+					 * should become STACK_ZERO in lo 4 bytes.
+					 */
+					if (bpf_register_is_null(spill))
+						val = STACK_ZERO;
+					for (j = 0; j < 4; j++) {
+						u8 *t = &st->stack[i].slot_type[j];
+
+						if (*t == STACK_SPILL)
+							*t = val;
+					}
+				}
+				bpf_mark_reg_not_init(env, spill);
+			}
+			for (j = start; j < end; j++)
+				st->stack[i].slot_type[j] = STACK_POISON;
+		}
+	}
+}
+
+/* Run __clean_func_state() on every frame of @st using per-instruction
+ * register liveness. Returns 0 on success or an error from
+ * bpf_live_stack_query_init().
+ */
+static int clean_verifier_state(struct bpf_verifier_env *env,
+				struct bpf_verifier_state *st)
+{
+	int frame, err;
+
+	err = bpf_live_stack_query_init(env, st);
+	if (err)
+		return err;
+	for (frame = 0; frame <= st->curframe; frame++) {
+		u32 insn_idx = bpf_frame_insn_idx(st, frame);
+		u16 live = env->insn_aux_data[insn_idx].live_regs_before;
+
+		__clean_func_state(env, st->frame[frame], live, frame);
+	}
+	return 0;
+}
+
+/* Registers match exactly when all fields preceding 'id' are byte-equal
+ * and both id and ref_obj_id map consistently under @idmap.
+ */
+static bool regs_exact(const struct bpf_reg_state *rold,
+		       const struct bpf_reg_state *rcur,
+		       struct bpf_idmap *idmap)
+{
+	if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) != 0)
+		return false;
+	if (!check_ids(rold->id, rcur->id, idmap))
+		return false;
+	return check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
+}
+
+/* Strictness levels for state comparison (see regsafe()). */
+enum exact_level {
+	NOT_EXACT,	/* imprecise scalars are accepted unconditionally */
+	EXACT,		/* registers must satisfy regs_exact() */
+	RANGE_WITHIN	/* range checks apply even to imprecise scalars */
+};
+
+/* Returns true if (rold safe implies rcur safe) */
+static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
+ struct bpf_reg_state *rcur, struct bpf_idmap *idmap,
+ enum exact_level exact)
+{
+ if (exact == EXACT)
+ return regs_exact(rold, rcur, idmap);
+
+ if (rold->type == NOT_INIT)
+ /* explored state can't have used this */
+ return true;
+
+ /* Enforce that register types have to match exactly, including their
+ * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general
+ * rule.
+ *
+ * One can make a point that using a pointer register as unbounded
+ * SCALAR would be technically acceptable, but this could lead to
+ * pointer leaks because scalars are allowed to leak while pointers
+ * are not. We could make this safe in special cases if root is
+ * calling us, but it's probably not worth the hassle.
+ *
+ * Also, register types that are *not* MAYBE_NULL could technically be
+ * safe to use as their MAYBE_NULL variants (e.g., PTR_TO_MAP_VALUE
+ * is safe to be used as PTR_TO_MAP_VALUE_OR_NULL, provided both point
+ * to the same map).
+ * However, if the old MAYBE_NULL register then got NULL checked,
+ * doing so could have affected others with the same id, and we can't
+ * check for that because we lost the id when we converted to
+ * a non-MAYBE_NULL variant.
+ * So, as a general rule we don't allow mixing MAYBE_NULL and
+ * non-MAYBE_NULL registers as well.
+ */
+ if (rold->type != rcur->type)
+ return false;
+
+ switch (base_type(rold->type)) {
+ case SCALAR_VALUE:
+ if (env->explore_alu_limits) {
+ /* explore_alu_limits disables tnum_in() and range_within()
+ * logic and requires everything to be strict
+ */
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ check_scalar_ids(rold->id, rcur->id, idmap);
+ }
+ if (!rold->precise && exact == NOT_EXACT)
+ return true;
+ /*
+ * Linked register tracking uses rold->id to detect relationships.
+ * When rold->id == 0, the register is independent and any linking
+ * in rcur only adds constraints. When rold->id != 0, we must verify
+ * id mapping and (for BPF_ADD_CONST) offset consistency.
+ *
+ * +------------------+-----------+------------------+---------------+
+ * | | rold->id | rold + ADD_CONST | rold->id == 0 |
+ * |------------------+-----------+------------------+---------------|
+ * | rcur->id | range,ids | false | range |
+ * | rcur + ADD_CONST | false | range,ids,off | range |
+ * | rcur->id == 0 | range,ids | false | range |
+ * +------------------+-----------+------------------+---------------+
+ *
+ * Why check_ids() for scalar registers?
+ *
+ * Consider the following BPF code:
+ * 1: r6 = ... unbound scalar, ID=a ...
+ * 2: r7 = ... unbound scalar, ID=b ...
+ * 3: if (r6 > r7) goto +1
+ * 4: r6 = r7
+ * 5: if (r6 > X) goto ...
+ * 6: ... memory operation using r7 ...
+ *
+ * First verification path is [1-6]:
+ * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7;
+ * - at (5) r6 would be marked <= X, sync_linked_regs() would also mark
+ * r7 <= X, because r6 and r7 share same id.
+ * Next verification path is [1-4, 6].
+ *
+ * Instruction (6) would be reached in two states:
+ * I. r6{.id=b}, r7{.id=b} via path 1-6;
+ * II. r6{.id=a}, r7{.id=b} via path 1-4, 6.
+ *
+ * Use check_ids() to distinguish these states.
+ * ---
+ * Also verify that new value satisfies old value range knowledge.
+ */
+
+ /*
+ * ADD_CONST flags must match exactly: BPF_ADD_CONST32 and
+ * BPF_ADD_CONST64 have different linking semantics in
+ * sync_linked_regs() (alu32 zero-extends, alu64 does not),
+ * so pruning across different flag types is unsafe.
+ */
+ if (rold->id &&
+ (rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST))
+ return false;
+
+ /* Both have offset linkage: offsets must match */
+ if ((rold->id & BPF_ADD_CONST) && rold->delta != rcur->delta)
+ return false;
+
+ if (!check_scalar_ids(rold->id, rcur->id, idmap))
+ return false;
+
+ return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off);
+ case PTR_TO_MAP_KEY:
+ case PTR_TO_MAP_VALUE:
+ case PTR_TO_MEM:
+ case PTR_TO_BUF:
+ case PTR_TO_TP_BUFFER:
+ /* If the new min/max/var_off satisfy the old ones and
+ * everything else matches, we are OK.
+ */
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
+ range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off) &&
+ check_ids(rold->id, rcur->id, idmap) &&
+ check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
+ case PTR_TO_PACKET_META:
+ case PTR_TO_PACKET:
+ /* We must have at least as much range as the old ptr
+ * did, so that any accesses which were safe before are
+ * still safe. This is true even if old range < old off,
+ * since someone could have accessed through (ptr - k), or
+ * even done ptr -= k in a register, to get a safe access.
+ */
+ if (rold->range < 0 || rcur->range < 0) {
+ /* special case for [BEYOND|AT]_PKT_END */
+ if (rold->range != rcur->range)
+ return false;
+ } else if (rold->range > rcur->range) {
+ return false;
+ }
+ /* id relations must be preserved */
+ if (!check_ids(rold->id, rcur->id, idmap))
+ return false;
+ /* new val must satisfy old val knowledge */
+ return range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
+ case PTR_TO_STACK:
+ /* two stack pointers are equal only if they're pointing to
+ * the same stack frame, since fp-8 in foo != fp-8 in bar
+ */
+ return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno;
+ case PTR_TO_ARENA:
+ return true;
+ case PTR_TO_INSN:
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
+ range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off);
+ default:
+ return regs_exact(rold, rcur, idmap);
+ }
+}
+
+/*
+ * Fake register describing a fully unknown (unbound, imprecise) scalar.
+ * scalar_reg_for_stack() returns it for STACK_MISC/INVALID slots so that
+ * such slots can be compared against spilled scalars via regsafe().
+ * Initialized once at boot via late_initcall().
+ */
+static struct bpf_reg_state unbound_reg;
+
+static __init int unbound_reg_init(void)
+{
+	bpf_mark_reg_unknown_imprecise(&unbound_reg);
+	return 0;
+}
+late_initcall(unbound_reg_init);
+
+/* True if @stack holds a spilled scalar whose spill covers the slot bytes
+ * starting at byte offset @im (callers pass 0 for 64-bit spills and 4 for
+ * 32-bit spills, see stacksafe()).
+ */
+static bool is_spilled_scalar_after(const struct bpf_stack_state *stack, int im)
+{
+	return stack->slot_type[im] == STACK_SPILL &&
+	       stack->spilled_ptr.type == SCALAR_VALUE;
+}
+
+/* True if every byte of @stack from offset @im to the end of the slot is
+ * STACK_MISC, or (when uninitialized stack access is allowed) STACK_INVALID
+ * / STACK_POISON.  Loads from such bytes produce unbound scalars, so the
+ * whole tail can be modelled as one unknown scalar (see unbound_reg).
+ */
+static bool is_stack_misc_after(struct bpf_verifier_env *env,
+				struct bpf_stack_state *stack, int im)
+{
+	u32 i;
+
+	for (i = im; i < ARRAY_SIZE(stack->slot_type); ++i) {
+		if ((stack->slot_type[i] == STACK_MISC) ||
+		    ((stack->slot_type[i] == STACK_INVALID || stack->slot_type[i] == STACK_POISON) &&
+		     env->allow_uninit_stack))
+			continue;
+		return false;
+	}
+
+	return true;
+}
+
+/* Return a scalar register view of the stack slot contents starting at byte
+ * offset @im: the spilled register itself for scalar spills, the fake
+ * unbound_reg for MISC/uninitialized data, or NULL if the slot contents
+ * cannot be treated as a scalar.
+ */
+static struct bpf_reg_state *scalar_reg_for_stack(struct bpf_verifier_env *env,
+						  struct bpf_stack_state *stack, int im)
+{
+	if (is_spilled_scalar_after(stack, im))
+		return &stack->spilled_ptr;
+
+	if (is_stack_misc_after(env, stack, im))
+		return &unbound_reg;
+
+	return NULL;
+}
+
+/* Check whether the stack of the explored function state @old safely
+ * over-approximates the stack of the current state @cur, byte by byte.
+ * Slots allocated in @cur beyond @old's allocated_stack are ignored since
+ * the explored state never used them.  @idmap records id correspondences
+ * across the two states; @exact selects comparison strictness (EXACT
+ * additionally requires matching slot types, modulo INVALID==POISON).
+ */
+static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
+		      struct bpf_func_state *cur, struct bpf_idmap *idmap,
+		      enum exact_level exact)
+{
+	int i, spi;
+
+	/* walk slots of the explored stack and ignore any additional
+	 * slots in the current stack, since explored(safe) state
+	 * didn't use them
+	 */
+	for (i = 0; i < old->allocated_stack; i++) {
+		struct bpf_reg_state *old_reg, *cur_reg;
+		int im = i % BPF_REG_SIZE;
+
+		spi = i / BPF_REG_SIZE;
+
+		if (exact == EXACT) {
+			u8 old_type = old->stack[spi].slot_type[i % BPF_REG_SIZE];
+			u8 cur_type = i < cur->allocated_stack ?
+				      cur->stack[spi].slot_type[i % BPF_REG_SIZE] : STACK_INVALID;
+
+			/* STACK_INVALID and STACK_POISON are equivalent for pruning */
+			if (old_type == STACK_POISON)
+				old_type = STACK_INVALID;
+			if (cur_type == STACK_POISON)
+				cur_type = STACK_INVALID;
+			if (i >= cur->allocated_stack || old_type != cur_type)
+				return false;
+		}
+
+		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID ||
+		    old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_POISON)
+			continue;
+
+		if (env->allow_uninit_stack &&
+		    old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC)
+			continue;
+
+		/* explored stack has more populated slots than current stack
+		 * and these slots were used
+		 */
+		if (i >= cur->allocated_stack)
+			return false;
+
+		/*
+		 * 64 and 32-bit scalar spills vs MISC/INVALID slots and vice versa.
+		 * Load from MISC/INVALID slots produces unbound scalar.
+		 * Construct a fake register for such stack and call
+		 * regsafe() to ensure scalar ids are compared.
+		 */
+		if (im == 0 || im == 4) {
+			old_reg = scalar_reg_for_stack(env, &old->stack[spi], im);
+			cur_reg = scalar_reg_for_stack(env, &cur->stack[spi], im);
+			if (old_reg && cur_reg) {
+				if (!regsafe(env, old_reg, cur_reg, idmap, exact))
+					return false;
+				/* skip the remaining bytes covered by the scalar:
+				 * 7 more for a 64-bit spill, 3 more for a 32-bit one
+				 */
+				i += (im == 0 ? BPF_REG_SIZE - 1 : 3);
+				continue;
+			}
+		}
+
+		/* if old state was safe with misc data in the stack
+		 * it will be safe with zero-initialized stack.
+		 * The opposite is not true
+		 */
+		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC &&
+		    cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO)
+			continue;
+		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
+		    cur->stack[spi].slot_type[i % BPF_REG_SIZE])
+			/* Ex: old explored (safe) state has STACK_SPILL in
+			 * this stack slot, but current has STACK_MISC ->
+			 * this verifier states are not equivalent,
+			 * return false to continue verification of this path
+			 */
+			return false;
+		/* only run the per-slot checks below once per 8-byte slot,
+		 * on its last byte
+		 */
+		if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1)
+			continue;
+		/* Both old and cur are having same slot_type */
+		switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) {
+		case STACK_SPILL:
+			/* when explored and current stack slot are both storing
+			 * spilled registers, check that stored pointers types
+			 * are the same as well.
+			 * Ex: explored safe path could have stored
+			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
+			 * but current path has stored:
+			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
+			 * such verifier states are not equivalent.
+			 * return false to continue verification of this path
+			 */
+			if (!regsafe(env, &old->stack[spi].spilled_ptr,
+				     &cur->stack[spi].spilled_ptr, idmap, exact))
+				return false;
+			break;
+		case STACK_DYNPTR:
+			old_reg = &old->stack[spi].spilled_ptr;
+			cur_reg = &cur->stack[spi].spilled_ptr;
+			if (old_reg->dynptr.type != cur_reg->dynptr.type ||
+			    old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot ||
+			    !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+				return false;
+			break;
+		case STACK_ITER:
+			old_reg = &old->stack[spi].spilled_ptr;
+			cur_reg = &cur->stack[spi].spilled_ptr;
+			/* iter.depth is not compared between states as it
+			 * doesn't matter for correctness and would otherwise
+			 * prevent convergence; we maintain it only to prevent
+			 * infinite loop check triggering, see
+			 * iter_active_depths_differ()
+			 */
+			if (old_reg->iter.btf != cur_reg->iter.btf ||
+			    old_reg->iter.btf_id != cur_reg->iter.btf_id ||
+			    old_reg->iter.state != cur_reg->iter.state ||
+			    /* ignore {old_reg,cur_reg}->iter.depth, see above */
+			    !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+				return false;
+			break;
+		case STACK_IRQ_FLAG:
+			old_reg = &old->stack[spi].spilled_ptr;
+			cur_reg = &cur->stack[spi].spilled_ptr;
+			if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) ||
+			    old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class)
+				return false;
+			break;
+		case STACK_MISC:
+		case STACK_ZERO:
+		case STACK_INVALID:
+		case STACK_POISON:
+			continue;
+		/* Ensure that new unhandled slot types return false by default */
+		default:
+			return false;
+		}
+	}
+	return true;
+}
+
+/* Check that the reference/lock state of @cur is equivalent to @old:
+ * same counts of acquired references and of the various lock kinds, and
+ * per-reference ids/types matching under @idmap (lock references must
+ * additionally point at the same object).
+ */
+static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *cur,
+		    struct bpf_idmap *idmap)
+{
+	int i;
+
+	if (old->acquired_refs != cur->acquired_refs)
+		return false;
+
+	if (old->active_locks != cur->active_locks)
+		return false;
+
+	if (old->active_preempt_locks != cur->active_preempt_locks)
+		return false;
+
+	if (old->active_rcu_locks != cur->active_rcu_locks)
+		return false;
+
+	if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap))
+		return false;
+
+	if (!check_ids(old->active_lock_id, cur->active_lock_id, idmap) ||
+	    old->active_lock_ptr != cur->active_lock_ptr)
+		return false;
+
+	for (i = 0; i < old->acquired_refs; i++) {
+		if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) ||
+		    old->refs[i].type != cur->refs[i].type)
+			return false;
+		switch (old->refs[i].type) {
+		case REF_TYPE_PTR:
+		case REF_TYPE_IRQ:
+			break;
+		case REF_TYPE_LOCK:
+		case REF_TYPE_RES_LOCK:
+		case REF_TYPE_RES_LOCK_IRQ:
+			/* lock references must refer to the very same object */
+			if (old->refs[i].ptr != cur->refs[i].ptr)
+				return false;
+			break;
+		default:
+			WARN_ONCE(1, "Unhandled enum type for reference state: %d\n", old->refs[i].type);
+			return false;
+		}
+	}
+
+	return true;
+}
+
+/* compare two verifier states
+ *
+ * all states stored in state_list are known to be valid, since
+ * verifier reached 'bpf_exit' instruction through them
+ *
+ * this function is called when verifier exploring different branches of
+ * execution popped from the state stack. If it sees an old state that has
+ * more strict register state and more strict stack state then this execution
+ * branch doesn't need to be explored further, since verifier already
+ * concluded that more strict state leads to valid finish.
+ *
+ * Therefore two states are equivalent if register state is more conservative
+ * and explored stack state is more conservative than the current one.
+ * Example:
+ *       explored                   current
+ * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC)
+ * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC)
+ *
+ * In other words if current stack state (one being explored) has more
+ * valid slots than old one that already passed validation, it means
+ * the verifier can stop exploring and conclude that current state is valid too
+ *
+ * Similarly with registers. If explored state has register type as invalid
+ * whereas register type in current state is meaningful, it means that
+ * the current state will reach 'bpf_exit' instruction safely
+ */
+static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
+			      struct bpf_func_state *cur, u32 insn_idx, enum exact_level exact)
+{
+	u16 live_regs = env->insn_aux_data[insn_idx].live_regs_before;
+	u16 i;
+
+	/* an old state verified with deeper callback unrolling cannot be
+	 * reused for a shallower current state
+	 */
+	if (old->callback_depth > cur->callback_depth)
+		return false;
+
+	/* only registers live before @insn_idx can influence the rest of
+	 * the program; dead registers are skipped
+	 */
+	for (i = 0; i < MAX_BPF_REG; i++)
+		if (((1 << i) & live_regs) &&
+		    !regsafe(env, &old->regs[i], &cur->regs[i],
+			     &env->idmap_scratch, exact))
+			return false;
+
+	if (!stacksafe(env, old, cur, &env->idmap_scratch, exact))
+		return false;
+
+	return true;
+}
+
+/* Reset the scratch id map used for id correspondence checks before a
+ * fresh pair-of-states comparison (see states_equal()).
+ */
+static void reset_idmap_scratch(struct bpf_verifier_env *env)
+{
+	struct bpf_idmap *idmap = &env->idmap_scratch;
+
+	idmap->tmp_id_gen = env->id_gen;
+	idmap->cnt = 0;
+}
+
+/* Top-level state comparison: true if already-explored state @old safely
+ * covers current state @cur.  Compares frame depth, speculation and
+ * sleepable flags, reference/lock state, then every frame's registers and
+ * stack via func_states_equal().  @exact selects comparison strictness.
+ */
+static bool states_equal(struct bpf_verifier_env *env,
+			 struct bpf_verifier_state *old,
+			 struct bpf_verifier_state *cur,
+			 enum exact_level exact)
+{
+	u32 insn_idx;
+	int i;
+
+	if (old->curframe != cur->curframe)
+		return false;
+
+	/* start id correspondence tracking from a clean slate */
+	reset_idmap_scratch(env);
+
+	/* Verification state from speculative execution simulation
+	 * must never prune a non-speculative execution one.
+	 */
+	if (old->speculative && !cur->speculative)
+		return false;
+
+	if (old->in_sleepable != cur->in_sleepable)
+		return false;
+
+	if (!refsafe(old, cur, &env->idmap_scratch))
+		return false;
+
+	/* for states to be equal callsites have to be the same
+	 * and all frame states need to be equivalent
+	 */
+	for (i = 0; i <= old->curframe; i++) {
+		insn_idx = bpf_frame_insn_idx(old, i);
+		if (old->frame[i]->callsite != cur->frame[i]->callsite)
+			return false;
+		if (!func_states_equal(env, old->frame[i], cur->frame[i], insn_idx, exact))
+			return false;
+	}
+	return true;
+}
+
+/* find precise scalars in the previous equivalent state and
+ * propagate them into the current state
+ *
+ * Precise scalar registers and spilled scalars from every frame of @old
+ * are collected as backtracking marks in env->bt, then resolved against
+ * @cur by bpf_mark_chain_precision().  @changed is passed through to
+ * bpf_mark_chain_precision() and may be NULL when the caller does not
+ * care whether any new precision mark was added.
+ */
+static int propagate_precision(struct bpf_verifier_env *env,
+			       const struct bpf_verifier_state *old,
+			       struct bpf_verifier_state *cur,
+			       bool *changed)
+{
+	struct bpf_reg_state *state_reg;
+	struct bpf_func_state *state;
+	int i, err = 0, fr;
+	bool first;
+
+	for (fr = old->curframe; fr >= 0; fr--) {
+		state = old->frame[fr];
+		state_reg = state->regs;
+		first = true;
+		for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
+			if (state_reg->type != SCALAR_VALUE ||
+			    !state_reg->precise)
+				continue;
+			if (env->log.level & BPF_LOG_LEVEL2) {
+				if (first)
+					verbose(env, "frame %d: propagating r%d", fr, i);
+				else
+					verbose(env, ",r%d", i);
+			}
+			bpf_bt_set_frame_reg(&env->bt, fr, i);
+			first = false;
+		}
+
+		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+			if (!bpf_is_spilled_reg(&state->stack[i]))
+				continue;
+			state_reg = &state->stack[i].spilled_ptr;
+			if (state_reg->type != SCALAR_VALUE ||
+			    !state_reg->precise)
+				continue;
+			if (env->log.level & BPF_LOG_LEVEL2) {
+				if (first)
+					verbose(env, "frame %d: propagating fp%d",
+						fr, (-i - 1) * BPF_REG_SIZE);
+				else
+					verbose(env, ",fp%d", (-i - 1) * BPF_REG_SIZE);
+			}
+			bpf_bt_set_frame_slot(&env->bt, fr, i);
+			first = false;
+		}
+		if (!first && (env->log.level & BPF_LOG_LEVEL2))
+			verbose(env, "\n");
+	}
+
+	err = bpf_mark_chain_precision(env, cur, -1, changed);
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+/* Upper bound on fixed-point iterations in propagate_backedges() */
+#define MAX_BACKEDGE_ITERS 64
+
+/* Propagate read and precision marks from visit->backedges[*].state->equal_state
+ * to corresponding parent states of visit->backedges[*].state until fixed point is reached,
+ * then free visit->backedges.
+ * After execution of this function incomplete_read_marks() will return false
+ * for all states corresponding to @visit->callchain.
+ */
+static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit)
+{
+	struct bpf_scc_backedge *backedge;
+	struct bpf_verifier_state *st;
+	bool changed;
+	int i, err;
+
+	i = 0;
+	do {
+		/* safety valve: if the fixed point is not reached within
+		 * MAX_BACKEDGE_ITERS rounds, conservatively mark all scalars
+		 * precise in every backedge state and stop iterating
+		 */
+		if (i++ > MAX_BACKEDGE_ITERS) {
+			if (env->log.level & BPF_LOG_LEVEL2)
+				verbose(env, "%s: too many iterations\n", __func__);
+			for (backedge = visit->backedges; backedge; backedge = backedge->next)
+				bpf_mark_all_scalars_precise(env, &backedge->state);
+			break;
+		}
+		changed = false;
+		for (backedge = visit->backedges; backedge; backedge = backedge->next) {
+			st = &backedge->state;
+			err = propagate_precision(env, st->equal_state, st, &changed);
+			if (err)
+				return err;
+		}
+	} while (changed);
+
+	bpf_free_backedges(visit);
+	return 0;
+}
+
+/* Cheap heuristic used before the expensive infinite-loop check: the
+ * states may be looping if they are at the same frame depth and all
+ * top-frame registers compare bit-identical up to the 'frameno' field.
+ */
+static bool states_maybe_looping(struct bpf_verifier_state *old,
+				 struct bpf_verifier_state *cur)
+{
+	struct bpf_func_state *fold, *fcur;
+	int i, fr = cur->curframe;
+
+	if (old->curframe != fr)
+		return false;
+
+	fold = old->frame[fr];
+	fcur = cur->frame[fr];
+	for (i = 0; i < MAX_BPF_REG; i++)
+		if (memcmp(&fold->regs[i], &fcur->regs[i],
+			   offsetof(struct bpf_reg_state, frameno)))
+			return false;
+	return true;
+}
+
+/* is_state_visited() handles iter_next() (see process_iter_next_call() for
+ * terminology) calls specially: as opposed to bounded BPF loops, it *expects*
+ * states to match, which otherwise would look like an infinite loop. So while
+ * iter_next() calls are taken care of, we still need to be careful and
+ * prevent erroneous and too eager declaration of "infinite loop", when
+ * iterators are involved.
+ *
+ * Here's a situation in pseudo-BPF assembly form:
+ *
+ *   0: again:                          ; set up iter_next() call args
+ *   1:   r1 = &it                      ; <CHECKPOINT HERE>
+ *   2:   call bpf_iter_num_next        ; this is iter_next() call
+ *   3:   if r0 == 0 goto done
+ *   4:   ... something useful here ...
+ *   5:   goto again                    ; another iteration
+ *   6: done:
+ *   7:   r1 = &it
+ *   8:   call bpf_iter_num_destroy     ; clean up iter state
+ *   9:   exit
+ *
+ * This is a typical loop. Let's assume that we have a prune point at 1:,
+ * before we get to `call bpf_iter_num_next` (e.g., because of that `goto
+ * again`, assuming other heuristics don't get in a way).
+ *
+ * When we first time come to 1:, let's say we have some state X. We proceed
+ * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit.
+ * Now we come back to validate that forked ACTIVE state. We proceed through
+ * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we
+ * are converging. But the problem is that we don't know that yet, as this
+ * convergence has to happen at iter_next() call site only. So if nothing is
+ * done, at 1: verifier will use bounded loop logic and declare infinite
+ * looping (and would be *technically* correct, if not for iterator's
+ * "eventual sticky NULL" contract, see process_iter_next_call()). But we
+ * don't want that. So what we do in process_iter_next_call() when we go on
+ * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's
+ * a different iteration. So when we suspect an infinite loop, we additionally
+ * check if any of the *ACTIVE* iterator states depths differ. If yes, we
+ * pretend we are not looping and wait for next iter_next() call.
+ *
+ * This only applies to ACTIVE state. In DRAINED state we don't expect to
+ * loop, because that would actually mean infinite loop, as DRAINED state is
+ * "sticky", and so we'll keep returning into the same instruction with the
+ * same state (at least in one of possible code paths).
+ *
+ * This approach allows to keep infinite loop heuristic even in the face of
+ * active iterator. E.g., C snippet below is and will be detected as
+ * infinitely looping:
+ *
+ *   struct bpf_iter_num it;
+ *   int *p, x;
+ *
+ *   bpf_iter_num_new(&it, 0, 10);
+ *   while ((p = bpf_iter_num_next(&it))) {
+ *       x = *p;
+ *       while (x--) {} // <<-- infinite loop here
+ *   }
+ *
+ */
+static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur)
+{
+	struct bpf_reg_state *slot, *cur_slot;
+	struct bpf_func_state *state;
+	int i, fr;
+
+	/* scan all frames for ACTIVE iterators and compare their depths */
+	for (fr = old->curframe; fr >= 0; fr--) {
+		state = old->frame[fr];
+		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+			if (state->stack[i].slot_type[0] != STACK_ITER)
+				continue;
+
+			slot = &state->stack[i].spilled_ptr;
+			if (slot->iter.state != BPF_ITER_STATE_ACTIVE)
+				continue;
+
+			cur_slot = &cur->frame[fr]->stack[i].spilled_ptr;
+			if (cur_slot->iter.depth != slot->iter.depth)
+				return true;
+		}
+	}
+	return false;
+}
+
+/* Clear the 'precise' flag on every scalar register and every spilled
+ * scalar in all frames of @st (the inverse of mark-all-precise; used when
+ * recording a new checkpoint so inherited precision marks are forgotten).
+ */
+static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+	struct bpf_func_state *func;
+	struct bpf_reg_state *reg;
+	int i, j;
+
+	for (i = 0; i <= st->curframe; i++) {
+		func = st->frame[i];
+		for (j = 0; j < BPF_REG_FP; j++) {
+			reg = &func->regs[j];
+			if (reg->type != SCALAR_VALUE)
+				continue;
+			reg->precise = false;
+		}
+		for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
+			if (!bpf_is_spilled_reg(&func->stack[j]))
+				continue;
+			reg = &func->stack[j].spilled_ptr;
+			if (reg->type != SCALAR_VALUE)
+				continue;
+			reg->precise = false;
+		}
+	}
+}
+
+/* Decide whether the current verifier state at @insn_idx can be pruned
+ * against an already-explored state.  Returns 1 when an equivalent state
+ * was found (after propagating precision marks back into the current
+ * state), 0 when exploration must continue (possibly after recording the
+ * current state as a new checkpoint), or a negative error code.
+ */
+int bpf_is_state_visited(struct bpf_verifier_env *env, int insn_idx)
+{
+	struct bpf_verifier_state_list *new_sl;
+	struct bpf_verifier_state_list *sl;
+	struct bpf_verifier_state *cur = env->cur_state, *new;
+	bool force_new_state, add_new_state, loop;
+	int n, err, states_cnt = 0;
+	struct list_head *pos, *tmp, *head;
+
+	force_new_state = env->test_state_freq || bpf_is_force_checkpoint(env, insn_idx) ||
+			  /* Avoid accumulating infinitely long jmp history */
+			  cur->jmp_history_cnt > 40;
+
+	/* bpf progs typically have pruning point every 4 instructions
+	 * http://vger.kernel.org/bpfconf2019.html#session-1
+	 * Do not add new state for future pruning if the verifier hasn't seen
+	 * at least 2 jumps and at least 8 instructions.
+	 * This heuristics helps decrease 'total_states' and 'peak_states' metric.
+	 * In tests that amounts to up to 50% reduction into total verifier
+	 * memory consumption and 20% verifier time speedup.
+	 */
+	add_new_state = force_new_state;
+	if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
+	    env->insn_processed - env->prev_insn_processed >= 8)
+		add_new_state = true;
+
+	/* keep cleaning the current state as registers/stack become dead */
+	err = clean_verifier_state(env, cur);
+	if (err)
+		return err;
+
+	loop = false;
+	head = bpf_explored_state(env, insn_idx);
+	list_for_each_safe(pos, tmp, head) {
+		sl = container_of(pos, struct bpf_verifier_state_list, node);
+		states_cnt++;
+		if (sl->state.insn_idx != insn_idx)
+			continue;
+
+		if (sl->state.branches) {
+			struct bpf_func_state *frame = sl->state.frame[sl->state.curframe];
+
+			if (frame->in_async_callback_fn &&
+			    frame->async_entry_cnt != cur->frame[cur->curframe]->async_entry_cnt) {
+				/* Different async_entry_cnt means that the verifier is
+				 * processing another entry into async callback.
+				 * Seeing the same state is not an indication of infinite
+				 * loop or infinite recursion.
+				 * But finding the same state doesn't mean that it's safe
+				 * to stop processing the current state. The previous state
+				 * hasn't yet reached bpf_exit, since state.branches > 0.
+				 * Checking in_async_callback_fn alone is not enough either.
+				 * Since the verifier still needs to catch infinite loops
+				 * inside async callbacks.
+				 */
+				goto skip_inf_loop_check;
+			}
+			/* BPF open-coded iterators loop detection is special.
+			 * states_maybe_looping() logic is too simplistic in detecting
+			 * states that *might* be equivalent, because it doesn't know
+			 * about ID remapping, so don't even perform it.
+			 * See process_iter_next_call() and iter_active_depths_differ()
+			 * for overview of the logic. When current and one of parent
+			 * states are detected as equivalent, it's a good thing: we prove
+			 * convergence and can stop simulating further iterations.
+			 * It's safe to assume that iterator loop will finish, taking into
+			 * account iter_next() contract of eventually returning
+			 * sticky NULL result.
+			 *
+			 * Note, that states have to be compared exactly in this case because
+			 * read and precision marks might not be finalized inside the loop.
+			 * E.g. as in the program below:
+			 *
+			 *     1. r7 = -16
+			 *     2. r6 = bpf_get_prandom_u32()
+			 *     3. while (bpf_iter_num_next(&fp[-8])) {
+			 *     4.   if (r6 != 42) {
+			 *     5.     r7 = -32
+			 *     6.     r6 = bpf_get_prandom_u32()
+			 *     7.     continue
+			 *     8.   }
+			 *     9.   r0 = r10
+			 *    10.   r0 += r7
+			 *    11.   r8 = *(u64 *)(r0 + 0)
+			 *    12.   r6 = bpf_get_prandom_u32()
+			 *    13. }
+			 *
+			 * Here verifier would first visit path 1-3, create a checkpoint at 3
+			 * with r7=-16, continue to 4-7,3. Existing checkpoint at 3 does
+			 * not have read or precision mark for r7 yet, thus inexact states
+			 * comparison would discard current state with r7=-32
+			 * => unsafe memory access at 11 would not be caught.
+			 */
+			if (is_iter_next_insn(env, insn_idx)) {
+				if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
+					struct bpf_func_state *cur_frame;
+					struct bpf_reg_state *iter_state, *iter_reg;
+					int spi;
+
+					cur_frame = cur->frame[cur->curframe];
+					/* btf_check_iter_kfuncs() enforces that
+					 * iter state pointer is always the first arg
+					 */
+					iter_reg = &cur_frame->regs[BPF_REG_1];
+					/* current state is valid due to states_equal(),
+					 * so we can assume valid iter and reg state,
+					 * no need for extra (re-)validations
+					 */
+					spi = bpf_get_spi(iter_reg->var_off.value);
+					iter_state = &bpf_func(env, iter_reg)->stack[spi].spilled_ptr;
+					if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) {
+						loop = true;
+						goto hit;
+					}
+				}
+				goto skip_inf_loop_check;
+			}
+			if (is_may_goto_insn_at(env, insn_idx)) {
+				if (sl->state.may_goto_depth != cur->may_goto_depth &&
+				    states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
+					loop = true;
+					goto hit;
+				}
+			}
+			if (bpf_calls_callback(env, insn_idx)) {
+				if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
+					loop = true;
+					goto hit;
+				}
+				goto skip_inf_loop_check;
+			}
+			/* attempt to detect infinite loop to avoid unnecessary doomed work */
+			if (states_maybe_looping(&sl->state, cur) &&
+			    states_equal(env, &sl->state, cur, EXACT) &&
+			    !iter_active_depths_differ(&sl->state, cur) &&
+			    sl->state.may_goto_depth == cur->may_goto_depth &&
+			    sl->state.callback_unroll_depth == cur->callback_unroll_depth) {
+				verbose_linfo(env, insn_idx, "; ");
+				verbose(env, "infinite loop detected at insn %d\n", insn_idx);
+				verbose(env, "cur state:");
+				print_verifier_state(env, cur, cur->curframe, true);
+				verbose(env, "old state:");
+				print_verifier_state(env, &sl->state, cur->curframe, true);
+				return -EINVAL;
+			}
+			/* if the verifier is processing a loop, avoid adding new state
+			 * too often, since different loop iterations have distinct
+			 * states and may not help future pruning.
+			 * This threshold shouldn't be too low to make sure that
+			 * a loop with large bound will be rejected quickly.
+			 * The most abusive loop will be:
+			 * r1 += 1
+			 * if r1 < 1000000 goto pc-2
+			 * 1M insn_processed limit / 100 == 10k peak states.
+			 * This threshold shouldn't be too high either, since states
+			 * at the end of the loop are likely to be useful in pruning.
+			 */
+skip_inf_loop_check:
+			if (!force_new_state &&
+			    env->jmps_processed - env->prev_jmps_processed < 20 &&
+			    env->insn_processed - env->prev_insn_processed < 100)
+				add_new_state = false;
+			goto miss;
+		}
+		/* See comments for mark_all_regs_read_and_precise() */
+		loop = incomplete_read_marks(env, &sl->state);
+		if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) {
+hit:
+			sl->hit_cnt++;
+
+			/* if previous state reached the exit with precision and
+			 * current state is equivalent to it (except precision marks)
+			 * the precision needs to be propagated back in
+			 * the current state.
+			 */
+			err = 0;
+			if (bpf_is_jmp_point(env, env->insn_idx))
+				err = bpf_push_jmp_history(env, cur, 0, 0);
+			err = err ? : propagate_precision(env, &sl->state, cur, NULL);
+			if (err)
+				return err;
+			/* When processing iterator based loops above propagate_liveness and
+			 * propagate_precision calls are not sufficient to transfer all relevant
+			 * read and precision marks. E.g. consider the following case:
+			 *
+			 *  .-> A --.  Assume the states are visited in the order A, B, C.
+			 *  |   |   |  Assume that state B reaches a state equivalent to state A.
+			 *  |   v   v  At this point, state C is not processed yet, so state A
+			 *  '-- B   C  has not received any read or precision marks from C.
+			 *             Thus, marks propagated from A to B are incomplete.
+			 *
+			 * The verifier mitigates this by performing the following steps:
+			 *
+			 * - Prior to the main verification pass, strongly connected components
+			 *   (SCCs) are computed over the program's control flow graph,
+			 *   intraprocedurally.
+			 *
+			 * - During the main verification pass, `maybe_enter_scc()` checks
+			 *   whether the current verifier state is entering an SCC. If so, an
+			 *   instance of a `bpf_scc_visit` object is created, and the state
+			 *   entering the SCC is recorded as the entry state.
+			 *
+			 * - This instance is associated not with the SCC itself, but with a
+			 *   `bpf_scc_callchain`: a tuple consisting of the call sites leading to
+			 *   the SCC and the SCC id. See `compute_scc_callchain()`.
+			 *
+			 * - When a verification path encounters a `states_equal(...,
+			 *   RANGE_WITHIN)` condition, there exists a call chain describing the
+			 *   current state and a corresponding `bpf_scc_visit` instance. A copy
+			 *   of the current state is created and added to
+			 *   `bpf_scc_visit->backedges`.
+			 *
+			 * - When a verification path terminates, `maybe_exit_scc()` is called
+			 *   from `bpf_update_branch_counts()`. For states with `branches == 0`, it
+			 *   checks whether the state is the entry state of any `bpf_scc_visit`
+			 *   instance. If it is, this indicates that all paths originating from
+			 *   this SCC visit have been explored. `propagate_backedges()` is then
+			 *   called, which propagates read and precision marks through the
+			 *   backedges until a fixed point is reached.
+			 *   (In the earlier example, this would propagate marks from A to B,
+			 *   from C to A, and then again from A to B.)
+			 *
+			 * A note on callchains
+			 * --------------------
+			 *
+			 * Consider the following example:
+			 *
+			 *   void foo() { loop { ... SCC#1 ... } }
+			 *   void main() {
+			 *     A: foo();
+			 *     B: ...
+			 *     C: foo();
+			 *   }
+			 *
+			 * Here, there are two distinct callchains leading to SCC#1:
+			 * - (A, SCC#1)
+			 * - (C, SCC#1)
+			 *
+			 * Each callchain identifies a separate `bpf_scc_visit` instance that
+			 * accumulates backedge states. The `propagate_{liveness,precision}()`
+			 * functions traverse the parent state of each backedge state, which
+			 * means these parent states must remain valid (i.e., not freed) while
+			 * the corresponding `bpf_scc_visit` instance exists.
+			 *
+			 * Associating `bpf_scc_visit` instances directly with SCCs instead of
+			 * callchains would break this invariant:
+			 * - States explored during `C: foo()` would contribute backedges to
+			 *   SCC#1, but SCC#1 would only be exited once the exploration of
+			 *   `A: foo()` completes.
+			 * - By that time, the states explored between `A: foo()` and `C: foo()`
+			 *   (i.e., `B: ...`) may have already been freed, causing the parent
+			 *   links for states from `C: foo()` to become invalid.
+			 */
+			if (loop) {
+				struct bpf_scc_backedge *backedge;
+
+				backedge = kzalloc_obj(*backedge,
+						       GFP_KERNEL_ACCOUNT);
+				if (!backedge)
+					return -ENOMEM;
+				err = bpf_copy_verifier_state(&backedge->state, cur);
+				backedge->state.equal_state = &sl->state;
+				backedge->state.insn_idx = insn_idx;
+				err = err ?: add_scc_backedge(env, &sl->state, backedge);
+				if (err) {
+					bpf_free_verifier_state(&backedge->state, false);
+					kfree(backedge);
+					return err;
+				}
+			}
+			return 1;
+		}
+miss:
+		/* when new state is not going to be added do not increase miss count.
+		 * Otherwise several loop iterations will remove the state
+		 * recorded earlier. The goal of these heuristics is to have
+		 * states from some iterations of the loop (some in the beginning
+		 * and some at the end) to help pruning.
+		 */
+		if (add_new_state)
+			sl->miss_cnt++;
+		/* heuristic to determine whether this state is beneficial
+		 * to keep checking from state equivalence point of view.
+		 * Higher numbers increase max_states_per_insn and verification time,
+		 * but do not meaningfully decrease insn_processed.
+		 * 'n' controls how many times state could miss before eviction.
+		 * Use bigger 'n' for checkpoints because evicting checkpoint states
+		 * too early would hinder iterator convergence.
+		 */
+		n = bpf_is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3;
+		if (sl->miss_cnt > sl->hit_cnt * n + n) {
+			/* the state is unlikely to be useful. Remove it to
+			 * speed up verification
+			 */
+			sl->in_free_list = true;
+			list_del(&sl->node);
+			list_add(&sl->node, &env->free_list);
+			env->free_list_size++;
+			env->explored_states_size--;
+			maybe_free_verifier_state(env, sl);
+		}
+	}
+
+	if (env->max_states_per_insn < states_cnt)
+		env->max_states_per_insn = states_cnt;
+
+	if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
+		return 0;
+
+	if (!add_new_state)
+		return 0;
+
+	/* There were no equivalent states, remember the current one.
+	 * Technically the current state is not proven to be safe yet,
+	 * but it will either reach outer most bpf_exit (which means it's safe)
+	 * or it will be rejected. When there are no loops the verifier won't be
+	 * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx)
+	 * again on the way to bpf_exit.
+	 * When looping the sl->state.branches will be > 0 and this state
+	 * will not be considered for equivalence until branches == 0.
+	 */
+	new_sl = kzalloc_obj(struct bpf_verifier_state_list, GFP_KERNEL_ACCOUNT);
+	if (!new_sl)
+		return -ENOMEM;
+	env->total_states++;
+	env->explored_states_size++;
+	update_peak_states(env);
+	env->prev_jmps_processed = env->jmps_processed;
+	env->prev_insn_processed = env->insn_processed;
+
+	/* forget precise markings we inherited, see __mark_chain_precision */
+	if (env->bpf_capable)
+		mark_all_scalars_imprecise(env, cur);
+
+	bpf_clear_singular_ids(env, cur);
+
+	/* add new state to the head of linked list */
+	new = &new_sl->state;
+	err = bpf_copy_verifier_state(new, cur);
+	if (err) {
+		bpf_free_verifier_state(new, false);
+		kfree(new_sl);
+		return err;
+	}
+	new->insn_idx = insn_idx;
+	verifier_bug_if(new->branches != 1, env,
+			"%s:branches_to_explore=%d insn %d",
+			__func__, new->branches, insn_idx);
+	err = maybe_enter_scc(env, new);
+	if (err) {
+		bpf_free_verifier_state(new, false);
+		kfree(new_sl);
+		return err;
+	}
+
+	cur->parent = new;
+	cur->first_insn_idx = insn_idx;
+	cur->dfs_depth = new->dfs_depth + 1;
+	bpf_clear_jmp_history(cur);
+	list_add(&new_sl->node, head);
+	return 0;
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0378e83b4099..a3c0214ca934 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -941,14 +941,6 @@ static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu));
}
-static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
-{
- if (rcu_trace_implies_rcu_gp())
- bpf_map_free_rcu_gp(rcu);
- else
- call_rcu(rcu, bpf_map_free_rcu_gp);
-}
-
/* decrement map refcnt and schedule it for freeing via workqueue
* (underlying map implementation ops->map_free() might sleep)
*/
@@ -959,8 +951,9 @@ void bpf_map_put(struct bpf_map *map)
bpf_map_free_id(map);
WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt));
+ /* RCU tasks trace grace period implies RCU grace period. */
if (READ_ONCE(map->free_after_mult_rcu_gp))
- call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp);
+ call_rcu_tasks_trace(&map->rcu, bpf_map_free_rcu_gp);
else if (READ_ONCE(map->free_after_rcu_gp))
call_rcu(&map->rcu, bpf_map_free_rcu_gp);
else
@@ -1234,7 +1227,7 @@ int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
}
EXPORT_SYMBOL_GPL(bpf_obj_name_cpy);
-int map_check_no_btf(const struct bpf_map *map,
+int map_check_no_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
@@ -2832,7 +2825,7 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr
sig = kvmemdup_bpfptr(usig, attr->signature_size);
if (IS_ERR(sig)) {
bpf_key_put(key);
- return -ENOMEM;
+ return PTR_ERR(sig);
}
bpf_dynptr_init(&sig_ptr, sig, BPF_DYNPTR_TYPE_LOCAL, 0,
@@ -3090,10 +3083,6 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
if (err < 0)
goto free_used_maps;
- prog = bpf_prog_select_runtime(prog, &err);
- if (err < 0)
- goto free_used_maps;
-
err = bpf_prog_mark_insn_arrays_ready(prog);
if (err < 0)
goto free_used_maps;
@@ -3261,12 +3250,16 @@ static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
bpf_link_dealloc(link);
}
-static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
+static bool bpf_link_is_tracepoint(struct bpf_link *link)
{
- if (rcu_trace_implies_rcu_gp())
- bpf_link_defer_dealloc_rcu_gp(rcu);
- else
- call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
+ /*
+ * Only these combinations support a tracepoint bpf_link.
+ * BPF_LINK_TYPE_TRACING raw_tp progs are hardcoded to use
+ * bpf_raw_tp_link_lops and thus dealloc_deferred(), see
+ * bpf_raw_tp_link_attach().
+ */
+ return link->type == BPF_LINK_TYPE_RAW_TRACEPOINT ||
+ (link->type == BPF_LINK_TYPE_TRACING && link->attach_type == BPF_TRACE_RAW_TP);
}
/* bpf_link_free is guaranteed to be called from process context */
@@ -3279,16 +3272,26 @@ static void bpf_link_free(struct bpf_link *link)
if (link->prog)
ops->release(link);
if (ops->dealloc_deferred) {
- /* Schedule BPF link deallocation, which will only then
+ /*
+ * Schedule BPF link deallocation, which will only then
* trigger putting BPF program refcount.
* If underlying BPF program is sleepable or BPF link's target
* attach hookpoint is sleepable or otherwise requires RCU GPs
* to ensure link and its underlying BPF program is not
* reachable anymore, we need to first wait for RCU tasks
- * trace sync, and then go through "classic" RCU grace period
+ * trace sync, and then go through "classic" RCU grace period.
+ *
+ * For tracepoint BPF links, we need to go through SRCU grace
+ * period wait instead when non-faultable tracepoint is used. We
+ * don't need to chain SRCU grace period waits, however, for the
+ * faultable case, since it exclusively uses RCU Tasks Trace.
*/
if (link->sleepable || (link->prog && link->prog->sleepable))
- call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
+ /* RCU Tasks Trace grace period implies RCU grace period. */
+ call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
+ /* We need to do a SRCU grace period wait for non-faultable tracepoint BPF links. */
+ else if (bpf_link_is_tracepoint(link))
+ call_tracepoint_unregister_atomic(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
else
call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
} else if (ops->dealloc) {
@@ -3733,6 +3736,23 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
tr = prog->aux->dst_trampoline;
tgt_prog = prog->aux->dst_prog;
}
+ /*
+ * This is to prevent modifying struct pt_regs via kprobe_write_ctx=true
+ * freplace prog. Without this check, kprobe_write_ctx=true freplace
+ * prog is allowed to attach to kprobe_write_ctx=false kprobe prog, and
+ * then modify the registers of the kprobe prog's target kernel
+ * function.
+ *
+ * This also blocks the combination of uprobe+freplace, because it is
+ * unable to recognize the use of the tgt_prog as an uprobe or a kprobe
+ * by tgt_prog itself. At attach time, uprobe/kprobe is recognized by
+ * the target perf event flags in __perf_event_set_bpf_prog().
+ */
+ if (prog->type == BPF_PROG_TYPE_EXT &&
+ prog->aux->kprobe_write_ctx != tgt_prog->aux->kprobe_write_ctx) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
err = bpf_link_prime(&link->link.link, &link_primer);
if (err)
@@ -6348,8 +6368,7 @@ static bool syscall_prog_is_valid_access(int off, int size,
{
if (off < 0 || off >= U16_MAX)
return false;
- if (off % size != 0)
- return false;
+ /* No alignment requirements for syscall ctx accesses. */
return true;
}
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 98d9b4c0daff..e791ae065c39 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -9,6 +9,8 @@
#include <linux/bpf_mem_alloc.h>
#include <linux/btf_ids.h>
#include <linux/mm_types.h>
+#include <linux/mmap_lock.h>
+#include <linux/sched/mm.h>
#include "mmap_unlock_work.h"
static const char * const iter_task_type_names[] = {
@@ -794,11 +796,20 @@ const struct bpf_func_proto bpf_find_vma_proto = {
.arg5_type = ARG_ANYTHING,
};
+static inline void bpf_iter_mmput_async(struct mm_struct *mm)
+{
+#ifdef CONFIG_MMU
+ mmput_async(mm);
+#else
+ mmput(mm);
+#endif
+}
+
struct bpf_iter_task_vma_kern_data {
struct task_struct *task;
struct mm_struct *mm;
- struct mmap_unlock_irq_work *work;
- struct vma_iterator vmi;
+ struct vm_area_struct snapshot;
+ u64 next_addr;
};
struct bpf_iter_task_vma {
@@ -819,12 +830,28 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
struct task_struct *task, u64 addr)
{
struct bpf_iter_task_vma_kern *kit = (void *)it;
- bool irq_work_busy = false;
int err;
BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
+ if (!IS_ENABLED(CONFIG_PER_VMA_LOCK)) {
+ kit->data = NULL;
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * Reject irqs-disabled contexts including NMI. Operations used
+ * by _next() and _destroy() (vma_end_read, fput, bpf_iter_mmput_async)
+ * can take spinlocks with IRQs disabled (pi_lock, pool->lock).
+ * Running from NMI or from a tracepoint that fires with those
+ * locks held could deadlock.
+ */
+ if (irqs_disabled()) {
+ kit->data = NULL;
+ return -EBUSY;
+ }
+
/* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
* before, so non-NULL kit->data doesn't point to previously
* bpf_mem_alloc'd bpf_iter_task_vma_kern_data
@@ -834,38 +861,131 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
return -ENOMEM;
kit->data->task = get_task_struct(task);
+ /*
+ * Safely read task->mm and acquire an mm reference.
+ *
+ * Cannot use get_task_mm() because its task_lock() is a
+ * blocking spin_lock that would deadlock if the target task
+ * already holds alloc_lock on this CPU (e.g. a softirq BPF
+ * program iterating a task interrupted while holding its
+ * alloc_lock).
+ */
+ if (!spin_trylock(&task->alloc_lock)) {
+ err = -EBUSY;
+ goto err_cleanup_iter;
+ }
kit->data->mm = task->mm;
+ if (kit->data->mm && !(task->flags & PF_KTHREAD))
+ mmget(kit->data->mm);
+ else
+ kit->data->mm = NULL;
+ spin_unlock(&task->alloc_lock);
if (!kit->data->mm) {
err = -ENOENT;
goto err_cleanup_iter;
}
- /* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */
- irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
- if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) {
- err = -EBUSY;
- goto err_cleanup_iter;
- }
-
- vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
+ kit->data->snapshot.vm_file = NULL;
+ kit->data->next_addr = addr;
return 0;
err_cleanup_iter:
- if (kit->data->task)
- put_task_struct(kit->data->task);
+ put_task_struct(kit->data->task);
bpf_mem_free(&bpf_global_ma, kit->data);
/* NULL kit->data signals failed bpf_iter_task_vma initialization */
kit->data = NULL;
return err;
}
+/*
+ * Find and lock the next VMA at or after data->next_addr.
+ *
+ * lock_vma_under_rcu() is a point lookup (mas_walk): it finds the VMA
+ * containing a given address but cannot iterate. An RCU-protected
+ * maple tree walk with vma_next() (mas_find) is needed first to locate
+ * the next VMA's vm_start across any gap.
+ *
+ * Between the RCU walk and the lock, the VMA may be removed, shrunk,
+ * or write-locked. On failure, advance past it using vm_end from the
+ * RCU walk. SLAB_TYPESAFE_BY_RCU can make vm_end stale, so fall back
+ * to PAGE_SIZE advancement to guarantee forward progress.
+ */
+static struct vm_area_struct *
+bpf_iter_task_vma_find_next(struct bpf_iter_task_vma_kern_data *data)
+{
+ struct vm_area_struct *vma;
+ struct vma_iterator vmi;
+ unsigned long start, end;
+
+retry:
+ rcu_read_lock();
+ vma_iter_init(&vmi, data->mm, data->next_addr);
+ vma = vma_next(&vmi);
+ if (!vma) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ start = vma->vm_start;
+ end = vma->vm_end;
+ rcu_read_unlock();
+
+ vma = lock_vma_under_rcu(data->mm, start);
+ if (!vma) {
+ if (end <= data->next_addr)
+ data->next_addr += PAGE_SIZE;
+ else
+ data->next_addr = end;
+ goto retry;
+ }
+
+ if (unlikely(vma->vm_end <= data->next_addr)) {
+ data->next_addr += PAGE_SIZE;
+ vma_end_read(vma);
+ goto retry;
+ }
+
+ return vma;
+}
+
+static void bpf_iter_task_vma_snapshot_reset(struct vm_area_struct *snap)
+{
+ if (snap->vm_file) {
+ fput(snap->vm_file);
+ snap->vm_file = NULL;
+ }
+}
+
__bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
{
struct bpf_iter_task_vma_kern *kit = (void *)it;
+ struct vm_area_struct *snap, *vma;
if (!kit->data) /* bpf_iter_task_vma_new failed */
return NULL;
- return vma_next(&kit->data->vmi);
+
+ snap = &kit->data->snapshot;
+
+ bpf_iter_task_vma_snapshot_reset(snap);
+
+ vma = bpf_iter_task_vma_find_next(kit->data);
+ if (!vma)
+ return NULL;
+
+ memcpy(snap, vma, sizeof(*snap));
+
+ /*
+ * The verifier only trusts vm_mm and vm_file (see
+ * BTF_TYPE_SAFE_TRUSTED_OR_NULL in verifier.c). Take a reference
+ * on vm_file; vm_mm is already correct because lock_vma_under_rcu()
+ * verifies vma->vm_mm == mm. All other pointers are untrusted by
+ * the verifier and left as-is.
+ */
+ if (snap->vm_file)
+ get_file(snap->vm_file);
+
+ kit->data->next_addr = vma->vm_end;
+ vma_end_read(vma);
+ return snap;
}
__bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
@@ -873,8 +993,9 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
struct bpf_iter_task_vma_kern *kit = (void *)it;
if (kit->data) {
- bpf_mmap_unlock_mm(kit->data->work, kit->data->mm);
+ bpf_iter_task_vma_snapshot_reset(&kit->data->snapshot);
put_task_struct(kit->data->task);
+ bpf_iter_mmput_async(kit->data->mm);
bpf_mem_free(&bpf_global_ma, kit->data);
}
}
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index 26fbfbb01700..ec9c310cf5d7 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -269,3 +269,51 @@ struct tnum tnum_bswap64(struct tnum a)
{
return TNUM(swab64(a.value), swab64(a.mask));
}
+
+/* Given tnum t, and a number z such that tmin <= z < tmax, where tmin
+ * is the smallest member of t (= t.value) and tmax is the largest
+ * member of t (= t.value | t.mask), returns the smallest member of t
+ * larger than z.
+ *
+ * For example,
+ * t = x11100x0
+ * z = 11110001 (241)
+ * result = 11110010 (242)
+ *
+ * Note: if this function is called with z >= tmax, it just returns
+ * early with tmax; if this function is called with z < tmin, the
+ * algorithm already returns tmin.
+ */
+u64 tnum_step(struct tnum t, u64 z)
+{
+ u64 tmax, d, carry_mask, filled, inc;
+
+ tmax = t.value | t.mask;
+
+ /* if z >= largest member of t, return largest member of t */
+ if (z >= tmax)
+ return tmax;
+
+ /* if z < smallest member of t, return smallest member of t */
+ if (z < t.value)
+ return t.value;
+
+ /*
+ * Let r be the result tnum member, z = t.value + d.
+ * Every tnum member is t.value | s for some submask s of t.mask,
+ * and since t.value & t.mask == 0, t.value | s == t.value + s.
+ * So r > z becomes s > d where d = z - t.value.
+ *
+ * Find the smallest submask s of t.mask greater than d by
+ * "incrementing d within the mask": fill every non-mask
+ * position with 1 (`filled`) so +1 ripples through the gaps,
+ * then keep only mask bits. `carry_mask` additionally fills
+ * positions at and below the highest non-mask 1 in d,
+ * preventing it from trapping the carry.
+ */
+ d = z - t.value;
+ carry_mask = (1ULL << fls64(d & ~t.mask)) - 1;
+ filled = d | carry_mask | ~t.mask;
+ inc = (filled + 1) & t.mask;
+ return t.value | inc;
+}
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 84db9e658e52..f02254a21585 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -1002,10 +1002,8 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
mutex_lock(&tr->mutex);
shim_link = cgroup_shim_find(tr, bpf_func);
- if (shim_link) {
+ if (shim_link && !IS_ERR(bpf_link_inc_not_zero(&shim_link->link.link))) {
/* Reusing existing shim attached by the other program. */
- bpf_link_inc(&shim_link->link.link);
-
mutex_unlock(&tr->mutex);
bpf_trampoline_put(tr); /* bpf_trampoline_get above */
return 0;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bb12ba020649..69d75515ed3f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -195,9 +195,6 @@ struct bpf_verifier_stack_elem {
#define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192
#define BPF_COMPLEXITY_LIMIT_STATES 64
-#define BPF_MAP_KEY_POISON (1ULL << 63)
-#define BPF_MAP_KEY_SEEN (1ULL << 62)
-
#define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512
#define BPF_PRIV_STACK_MIN_SIZE 64
@@ -210,16 +207,10 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env);
static int ref_set_non_owning(struct bpf_verifier_env *env,
struct bpf_reg_state *reg);
static bool is_trusted_reg(const struct bpf_reg_state *reg);
-
-static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
-{
- return aux->map_ptr_state.poison;
-}
-
-static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux)
-{
- return aux->map_ptr_state.unpriv;
-}
+static inline bool in_sleepable_context(struct bpf_verifier_env *env);
+static const char *non_sleepable_context_description(struct bpf_verifier_env *env);
+static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg);
+static void scalar_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg);
static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
struct bpf_map *map,
@@ -231,21 +222,6 @@ static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
aux->map_ptr_state.map_ptr = map;
}
-static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux)
-{
- return aux->map_key_state & BPF_MAP_KEY_POISON;
-}
-
-static bool bpf_map_key_unseen(const struct bpf_insn_aux_data *aux)
-{
- return !(aux->map_key_state & BPF_MAP_KEY_SEEN);
-}
-
-static u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux)
-{
- return aux->map_key_state & ~(BPF_MAP_KEY_SEEN | BPF_MAP_KEY_POISON);
-}
-
static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
{
bool poisoned = bpf_map_key_poisoned(aux);
@@ -254,29 +230,6 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
(poisoned ? BPF_MAP_KEY_POISON : 0ULL);
}
-static bool bpf_helper_call(const struct bpf_insn *insn)
-{
- return insn->code == (BPF_JMP | BPF_CALL) &&
- insn->src_reg == 0;
-}
-
-static bool bpf_pseudo_call(const struct bpf_insn *insn)
-{
- return insn->code == (BPF_JMP | BPF_CALL) &&
- insn->src_reg == BPF_PSEUDO_CALL;
-}
-
-static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn)
-{
- return insn->code == (BPF_JMP | BPF_CALL) &&
- insn->src_reg == BPF_PSEUDO_KFUNC_CALL;
-}
-
-struct bpf_map_desc {
- struct bpf_map *ptr;
- int uid;
-};
-
struct bpf_call_arg_meta {
struct bpf_map_desc map;
bool raw_mode;
@@ -306,59 +259,6 @@ struct bpf_kfunc_meta {
s32 id;
};
-struct bpf_kfunc_call_arg_meta {
- /* In parameters */
- struct btf *btf;
- u32 func_id;
- u32 kfunc_flags;
- const struct btf_type *func_proto;
- const char *func_name;
- /* Out parameters */
- u32 ref_obj_id;
- u8 release_regno;
- bool r0_rdonly;
- u32 ret_btf_id;
- u64 r0_size;
- u32 subprogno;
- struct {
- u64 value;
- bool found;
- } arg_constant;
-
- /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling,
- * generally to pass info about user-defined local kptr types to later
- * verification logic
- * bpf_obj_drop/bpf_percpu_obj_drop
- * Record the local kptr type to be drop'd
- * bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type)
- * Record the local kptr type to be refcount_incr'd and use
- * arg_owning_ref to determine whether refcount_acquire should be
- * fallible
- */
- struct btf *arg_btf;
- u32 arg_btf_id;
- bool arg_owning_ref;
- bool arg_prog;
-
- struct {
- struct btf_field *field;
- } arg_list_head;
- struct {
- struct btf_field *field;
- } arg_rbtree_root;
- struct {
- enum bpf_dynptr_type type;
- u32 id;
- u32 ref_obj_id;
- } initialized_dynptr;
- struct {
- u8 spi;
- u8 frameno;
- } iter;
- struct bpf_map_desc map;
- u64 mem_size;
-};
-
struct btf *btf_vmlinux;
static const char *btf_type_name(const struct btf *btf, u32 id)
@@ -437,13 +337,36 @@ static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
return rec;
}
-static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog)
+bool bpf_subprog_is_global(const struct bpf_verifier_env *env, int subprog)
{
struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux;
return aux && aux[subprog].linkage == BTF_FUNC_GLOBAL;
}
+static bool subprog_returns_void(struct bpf_verifier_env *env, int subprog)
+{
+ const struct btf_type *type, *func, *func_proto;
+ const struct btf *btf = env->prog->aux->btf;
+ u32 btf_id;
+
+ btf_id = env->prog->aux->func_info[subprog].type_id;
+
+ func = btf_type_by_id(btf, btf_id);
+ if (verifier_bug_if(!func, env, "btf_id %u not found", btf_id))
+ return false;
+
+ func_proto = btf_type_by_id(btf, func->type);
+ if (!func_proto)
+ return false;
+
+ type = btf_type_skip_modifiers(btf, func_proto->type, NULL);
+ if (!type)
+ return false;
+
+ return btf_type_is_void(type);
+}
+
static const char *subprog_name(const struct bpf_verifier_env *env, int subprog)
{
struct bpf_func_info *info;
@@ -455,7 +378,7 @@ static const char *subprog_name(const struct bpf_verifier_env *env, int subprog)
return btf_type_name(env->prog->aux->btf, info->type_id);
}
-static void mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog)
+void bpf_mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog)
{
struct bpf_subprog_info *info = subprog_info(env, subprog);
@@ -543,13 +466,13 @@ static bool is_callback_calling_function(enum bpf_func_id func_id)
is_async_callback_calling_function(func_id);
}
-static bool is_sync_callback_calling_insn(struct bpf_insn *insn)
+bool bpf_is_sync_callback_calling_insn(struct bpf_insn *insn)
{
return (bpf_helper_call(insn) && is_sync_callback_calling_function(insn->imm)) ||
(bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm));
}
-static bool is_async_callback_calling_insn(struct bpf_insn *insn)
+bool bpf_is_async_callback_calling_insn(struct bpf_insn *insn)
{
return (bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm)) ||
(bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm));
@@ -570,24 +493,11 @@ static bool is_async_cb_sleepable(struct bpf_verifier_env *env, struct bpf_insn
return false;
}
-static bool is_may_goto_insn(struct bpf_insn *insn)
+bool bpf_is_may_goto_insn(struct bpf_insn *insn)
{
return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO;
}
-static bool is_may_goto_insn_at(struct bpf_verifier_env *env, int insn_idx)
-{
- return is_may_goto_insn(&env->prog->insnsi[insn_idx]);
-}
-
-static bool is_storage_get_function(enum bpf_func_id func_id)
-{
- return func_id == BPF_FUNC_sk_storage_get ||
- func_id == BPF_FUNC_inode_storage_get ||
- func_id == BPF_FUNC_task_storage_get ||
- func_id == BPF_FUNC_cgrp_storage_get;
-}
-
static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,
const struct bpf_map *map)
{
@@ -603,32 +513,6 @@ static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,
return ref_obj_uses > 1;
}
-static bool is_cmpxchg_insn(const struct bpf_insn *insn)
-{
- return BPF_CLASS(insn->code) == BPF_STX &&
- BPF_MODE(insn->code) == BPF_ATOMIC &&
- insn->imm == BPF_CMPXCHG;
-}
-
-static bool is_atomic_load_insn(const struct bpf_insn *insn)
-{
- return BPF_CLASS(insn->code) == BPF_STX &&
- BPF_MODE(insn->code) == BPF_ATOMIC &&
- insn->imm == BPF_LOAD_ACQ;
-}
-
-static int __get_spi(s32 off)
-{
- return (-off - 1) / BPF_REG_SIZE;
-}
-
-static struct bpf_func_state *func(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg)
-{
- struct bpf_verifier_state *cur = env->cur_state;
-
- return cur->frame[reg->frameno];
-}
static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots)
{
@@ -654,19 +538,19 @@ static int stack_slot_obj_get_spi(struct bpf_verifier_env *env, struct bpf_reg_s
return -EINVAL;
}
- off = reg->off + reg->var_off.value;
+ off = reg->var_off.value;
if (off % BPF_REG_SIZE) {
verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off);
return -EINVAL;
}
- spi = __get_spi(off);
+ spi = bpf_get_spi(off);
if (spi + 1 < nr_slots) {
verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off);
return -EINVAL;
}
- if (!is_spi_bounds_valid(func(env, reg), spi, nr_slots))
+ if (!is_spi_bounds_valid(bpf_func(env, reg), spi, nr_slots))
return -ERANGE;
return spi;
}
@@ -735,8 +619,6 @@ static void __mark_dynptr_reg(struct bpf_reg_state *reg,
enum bpf_dynptr_type type,
bool first_slot, int dynptr_id);
-static void __mark_reg_not_init(const struct bpf_verifier_env *env,
- struct bpf_reg_state *reg);
static void mark_dynptr_stack_regs(struct bpf_verifier_env *env,
struct bpf_reg_state *sreg1,
@@ -762,7 +644,7 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
enum bpf_arg_type arg_type, int insn_idx, int clone_ref_obj_id)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
enum bpf_dynptr_type type;
int spi, i, err;
@@ -814,8 +696,6 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
state->stack[spi - 1].spilled_ptr.ref_obj_id = id;
}
- bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi));
-
return 0;
}
@@ -828,15 +708,13 @@ static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_stat
state->stack[spi - 1].slot_type[i] = STACK_INVALID;
}
- __mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
- __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
-
- bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi));
+ bpf_mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
+ bpf_mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
}
static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int spi, ref_obj_id, i;
/*
@@ -895,7 +773,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env,
static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
if (!env->allow_ptr_leaks)
- __mark_reg_not_init(env, reg);
+ bpf_mark_reg_not_init(env, reg);
else
__mark_reg_unknown(env, reg);
}
@@ -920,8 +798,27 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
spi = spi + 1;
if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
- verbose(env, "cannot overwrite referenced dynptr\n");
- return -EINVAL;
+ int ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id;
+ int ref_cnt = 0;
+
+ /*
+ * A referenced dynptr can be overwritten only if there is at
+ * least one other dynptr sharing the same ref_obj_id,
+ * ensuring the reference can still be properly released.
+ */
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_DYNPTR)
+ continue;
+ if (!state->stack[i].spilled_ptr.dynptr.first_slot)
+ continue;
+ if (state->stack[i].spilled_ptr.ref_obj_id == ref_obj_id)
+ ref_cnt++;
+ }
+
+ if (ref_cnt <= 1) {
+ verbose(env, "cannot overwrite referenced dynptr\n");
+ return -EINVAL;
+ }
}
mark_stack_slot_scratched(env, spi);
@@ -946,10 +843,8 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
/* Do not release reference state, we are destroying dynptr on stack,
* not using some helper to release it. Just reset register.
*/
- __mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
- __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
-
- bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi));
+ bpf_mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
+ bpf_mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
return 0;
}
@@ -984,7 +879,7 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_
static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int i, spi;
/* This already represents first slot of initialized bpf_dynptr.
@@ -1014,7 +909,7 @@ static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_re
static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
enum bpf_arg_type arg_type)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
enum bpf_dynptr_type dynptr_type;
int spi;
@@ -1044,7 +939,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, int insn_idx,
struct btf *btf, u32 btf_id, int nr_slots)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int spi, i, j, id;
spi = iter_get_spi(env, reg, nr_slots);
@@ -1076,7 +971,6 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
for (j = 0; j < BPF_REG_SIZE; j++)
slot->slot_type[j] = STACK_ITER;
- bpf_mark_stack_write(env, state->frameno, BIT(spi - i));
mark_stack_slot_scratched(env, spi - i);
}
@@ -1086,7 +980,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, int nr_slots)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int spi, i, j;
spi = iter_get_spi(env, reg, nr_slots);
@@ -1100,12 +994,11 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
if (i == 0)
WARN_ON_ONCE(release_reference(env, st->ref_obj_id));
- __mark_reg_not_init(env, st);
+ bpf_mark_reg_not_init(env, st);
for (j = 0; j < BPF_REG_SIZE; j++)
slot->slot_type[j] = STACK_INVALID;
- bpf_mark_stack_write(env, state->frameno, BIT(spi - i));
mark_stack_slot_scratched(env, spi - i);
}
@@ -1115,7 +1008,7 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, int nr_slots)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int spi, i, j;
/* For -ERANGE (i.e. spi not falling into allocated stack slots), we
@@ -1142,7 +1035,7 @@ static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
struct btf *btf, u32 btf_id, int nr_slots)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int spi, i, j;
spi = iter_get_spi(env, reg, nr_slots);
@@ -1179,7 +1072,7 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, int insn_idx,
int kfunc_class)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
struct bpf_stack_state *slot;
struct bpf_reg_state *st;
int spi, i, id;
@@ -1195,7 +1088,6 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env,
slot = &state->stack[spi];
st = &slot->spilled_ptr;
- bpf_mark_stack_write(env, reg->frameno, BIT(spi));
__mark_reg_known_zero(st);
st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
st->ref_obj_id = id;
@@ -1211,7 +1103,7 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env,
static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
int kfunc_class)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
struct bpf_stack_state *slot;
struct bpf_reg_state *st;
int spi, i, err;
@@ -1249,9 +1141,7 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r
return err;
}
- __mark_reg_not_init(env, st);
-
- bpf_mark_stack_write(env, reg->frameno, BIT(spi));
+ bpf_mark_reg_not_init(env, st);
for (i = 0; i < BPF_REG_SIZE; i++)
slot->slot_type[i] = STACK_INVALID;
@@ -1262,7 +1152,7 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r
static bool is_irq_flag_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
struct bpf_stack_state *slot;
int spi, i;
@@ -1286,7 +1176,7 @@ static bool is_irq_flag_reg_valid_uninit(struct bpf_verifier_env *env, struct bp
static int is_irq_flag_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
struct bpf_stack_state *slot;
struct bpf_reg_state *st;
int spi, i;
@@ -1324,6 +1214,7 @@ static bool is_stack_slot_special(const struct bpf_stack_state *stack)
case STACK_IRQ_FLAG:
return true;
case STACK_INVALID:
+ case STACK_POISON:
case STACK_MISC:
case STACK_ZERO:
return false;
@@ -1336,26 +1227,12 @@ static bool is_stack_slot_special(const struct bpf_stack_state *stack)
/* The reg state of a pointer or a bounded scalar was saved when
* it was spilled to the stack.
*/
-static bool is_spilled_reg(const struct bpf_stack_state *stack)
-{
- return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL;
-}
-static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack)
-{
- return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL &&
- stack->spilled_ptr.type == SCALAR_VALUE;
-}
-
-static bool is_spilled_scalar_reg64(const struct bpf_stack_state *stack)
-{
- return stack->slot_type[0] == STACK_SPILL &&
- stack->spilled_ptr.type == SCALAR_VALUE;
-}
-
-/* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which
- * case they are equivalent, or it's STACK_ZERO, in which case we preserve
- * more precise STACK_ZERO.
+/*
+ * Mark stack slot as STACK_MISC, unless it is already:
+ * - STACK_INVALID, in which case they are equivalent.
+ * - STACK_ZERO, in which case we preserve more precise STACK_ZERO.
+ * - STACK_POISON, which truly forbids access to the slot.
* Regardless of allow_ptr_leaks setting (i.e., privileged or unprivileged
* mode), we won't promote STACK_INVALID to STACK_MISC. In privileged case it is
* unnecessary as both are considered equivalent when loading data and pruning,
@@ -1366,14 +1243,14 @@ static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype)
{
if (*stype == STACK_ZERO)
return;
- if (*stype == STACK_INVALID)
+ if (*stype == STACK_INVALID || *stype == STACK_POISON)
return;
*stype = STACK_MISC;
}
static void scrub_spilled_slot(u8 *stype)
{
- if (*stype != STACK_INVALID)
+ if (*stype != STACK_INVALID && *stype != STACK_POISON)
*stype = STACK_MISC;
}
@@ -1662,14 +1539,6 @@ static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *st
return NULL;
}
-static void update_peak_states(struct bpf_verifier_env *env)
-{
- u32 cur_states;
-
- cur_states = env->explored_states_size + env->free_list_size + env->num_backedges;
- env->peak_states = max(env->peak_states, cur_states);
-}
-
static void free_func_state(struct bpf_func_state *state)
{
if (!state)
@@ -1678,15 +1547,15 @@ static void free_func_state(struct bpf_func_state *state)
kfree(state);
}
-static void clear_jmp_history(struct bpf_verifier_state *state)
+void bpf_clear_jmp_history(struct bpf_verifier_state *state)
{
kfree(state->jmp_history);
state->jmp_history = NULL;
state->jmp_history_cnt = 0;
}
-static void free_verifier_state(struct bpf_verifier_state *state,
- bool free_self)
+void bpf_free_verifier_state(struct bpf_verifier_state *state,
+ bool free_self)
{
int i;
@@ -1695,42 +1564,11 @@ static void free_verifier_state(struct bpf_verifier_state *state,
state->frame[i] = NULL;
}
kfree(state->refs);
- clear_jmp_history(state);
+ bpf_clear_jmp_history(state);
if (free_self)
kfree(state);
}
-/* struct bpf_verifier_state->parent refers to states
- * that are in either of env->{expored_states,free_list}.
- * In both cases the state is contained in struct bpf_verifier_state_list.
- */
-static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_state *st)
-{
- if (st->parent)
- return container_of(st->parent, struct bpf_verifier_state_list, state);
- return NULL;
-}
-
-static bool incomplete_read_marks(struct bpf_verifier_env *env,
- struct bpf_verifier_state *st);
-
-/* A state can be freed if it is no longer referenced:
- * - is in the env->free_list;
- * - has no children states;
- */
-static void maybe_free_verifier_state(struct bpf_verifier_env *env,
- struct bpf_verifier_state_list *sl)
-{
- if (!sl->in_free_list
- || sl->state.branches != 0
- || incomplete_read_marks(env, &sl->state))
- return;
- list_del(&sl->node);
- free_verifier_state(&sl->state, false);
- kfree(sl);
- env->free_list_size--;
-}
-
/* copy verifier state from src to dst growing dst stack space
* when necessary to accommodate larger src stack
*/
@@ -1741,8 +1579,8 @@ static int copy_func_state(struct bpf_func_state *dst,
return copy_stack_state(dst, src);
}
-static int copy_verifier_state(struct bpf_verifier_state *dst_state,
- const struct bpf_verifier_state *src)
+int bpf_copy_verifier_state(struct bpf_verifier_state *dst_state,
+ const struct bpf_verifier_state *src)
{
struct bpf_func_state *dst;
int i, err;
@@ -1766,7 +1604,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
return err;
dst_state->speculative = src->speculative;
dst_state->in_sleepable = src->in_sleepable;
- dst_state->cleaned = src->cleaned;
dst_state->curframe = src->curframe;
dst_state->branches = src->branches;
dst_state->parent = src->parent;
@@ -1796,7 +1633,7 @@ static u32 state_htab_size(struct bpf_verifier_env *env)
return env->prog->len;
}
-static struct list_head *explored_state(struct bpf_verifier_env *env, int idx)
+struct list_head *bpf_explored_state(struct bpf_verifier_env *env, int idx)
{
struct bpf_verifier_state *cur = env->cur_state;
struct bpf_func_state *state = cur->frame[cur->curframe];
@@ -1818,266 +1655,19 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta
return true;
}
-/* Return IP for a given frame in a call stack */
-static u32 frame_insn_idx(struct bpf_verifier_state *st, u32 frame)
-{
- return frame == st->curframe
- ? st->insn_idx
- : st->frame[frame + 1]->callsite;
-}
-
-/* For state @st look for a topmost frame with frame_insn_idx() in some SCC,
- * if such frame exists form a corresponding @callchain as an array of
- * call sites leading to this frame and SCC id.
- * E.g.:
- *
- * void foo() { A: loop {... SCC#1 ...}; }
- * void bar() { B: loop { C: foo(); ... SCC#2 ... }
- * D: loop { E: foo(); ... SCC#3 ... } }
- * void main() { F: bar(); }
- *
- * @callchain at (A) would be either (F,SCC#2) or (F,SCC#3) depending
- * on @st frame call sites being (F,C,A) or (F,E,A).
- */
-static bool compute_scc_callchain(struct bpf_verifier_env *env,
- struct bpf_verifier_state *st,
- struct bpf_scc_callchain *callchain)
-{
- u32 i, scc, insn_idx;
-
- memset(callchain, 0, sizeof(*callchain));
- for (i = 0; i <= st->curframe; i++) {
- insn_idx = frame_insn_idx(st, i);
- scc = env->insn_aux_data[insn_idx].scc;
- if (scc) {
- callchain->scc = scc;
- break;
- } else if (i < st->curframe) {
- callchain->callsites[i] = insn_idx;
- } else {
- return false;
- }
- }
- return true;
-}
-
-/* Check if bpf_scc_visit instance for @callchain exists. */
-static struct bpf_scc_visit *scc_visit_lookup(struct bpf_verifier_env *env,
- struct bpf_scc_callchain *callchain)
-{
- struct bpf_scc_info *info = env->scc_info[callchain->scc];
- struct bpf_scc_visit *visits = info->visits;
- u32 i;
-
- if (!info)
- return NULL;
- for (i = 0; i < info->num_visits; i++)
- if (memcmp(callchain, &visits[i].callchain, sizeof(*callchain)) == 0)
- return &visits[i];
- return NULL;
-}
-
-/* Allocate a new bpf_scc_visit instance corresponding to @callchain.
- * Allocated instances are alive for a duration of the do_check_common()
- * call and are freed by free_states().
- */
-static struct bpf_scc_visit *scc_visit_alloc(struct bpf_verifier_env *env,
- struct bpf_scc_callchain *callchain)
-{
- struct bpf_scc_visit *visit;
- struct bpf_scc_info *info;
- u32 scc, num_visits;
- u64 new_sz;
-
- scc = callchain->scc;
- info = env->scc_info[scc];
- num_visits = info ? info->num_visits : 0;
- new_sz = sizeof(*info) + sizeof(struct bpf_scc_visit) * (num_visits + 1);
- info = kvrealloc(env->scc_info[scc], new_sz, GFP_KERNEL_ACCOUNT);
- if (!info)
- return NULL;
- env->scc_info[scc] = info;
- info->num_visits = num_visits + 1;
- visit = &info->visits[num_visits];
- memset(visit, 0, sizeof(*visit));
- memcpy(&visit->callchain, callchain, sizeof(*callchain));
- return visit;
-}
-
-/* Form a string '(callsite#1,callsite#2,...,scc)' in env->tmp_str_buf */
-static char *format_callchain(struct bpf_verifier_env *env, struct bpf_scc_callchain *callchain)
-{
- char *buf = env->tmp_str_buf;
- int i, delta = 0;
-
- delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "(");
- for (i = 0; i < ARRAY_SIZE(callchain->callsites); i++) {
- if (!callchain->callsites[i])
- break;
- delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u,",
- callchain->callsites[i]);
- }
- delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u)", callchain->scc);
- return env->tmp_str_buf;
-}
-
-/* If callchain for @st exists (@st is in some SCC), ensure that
- * bpf_scc_visit instance for this callchain exists.
- * If instance does not exist or is empty, assign visit->entry_state to @st.
- */
-static int maybe_enter_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
-{
- struct bpf_scc_callchain *callchain = &env->callchain_buf;
- struct bpf_scc_visit *visit;
-
- if (!compute_scc_callchain(env, st, callchain))
- return 0;
- visit = scc_visit_lookup(env, callchain);
- visit = visit ?: scc_visit_alloc(env, callchain);
- if (!visit)
- return -ENOMEM;
- if (!visit->entry_state) {
- visit->entry_state = st;
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "SCC enter %s\n", format_callchain(env, callchain));
- }
- return 0;
-}
-
-static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit);
-
-/* If callchain for @st exists (@st is in some SCC), make it empty:
- * - set visit->entry_state to NULL;
- * - flush accumulated backedges.
- */
-static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
-{
- struct bpf_scc_callchain *callchain = &env->callchain_buf;
- struct bpf_scc_visit *visit;
-
- if (!compute_scc_callchain(env, st, callchain))
- return 0;
- visit = scc_visit_lookup(env, callchain);
- if (!visit) {
- /*
- * If path traversal stops inside an SCC, corresponding bpf_scc_visit
- * must exist for non-speculative paths. For non-speculative paths
- * traversal stops when:
- * a. Verification error is found, maybe_exit_scc() is not called.
- * b. Top level BPF_EXIT is reached. Top level BPF_EXIT is not a member
- * of any SCC.
- * c. A checkpoint is reached and matched. Checkpoints are created by
- * is_state_visited(), which calls maybe_enter_scc(), which allocates
- * bpf_scc_visit instances for checkpoints within SCCs.
- * (c) is the only case that can reach this point.
- */
- if (!st->speculative) {
- verifier_bug(env, "scc exit: no visit info for call chain %s",
- format_callchain(env, callchain));
- return -EFAULT;
- }
- return 0;
- }
- if (visit->entry_state != st)
- return 0;
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "SCC exit %s\n", format_callchain(env, callchain));
- visit->entry_state = NULL;
- env->num_backedges -= visit->num_backedges;
- visit->num_backedges = 0;
- update_peak_states(env);
- return propagate_backedges(env, visit);
-}
-
-/* Lookup an bpf_scc_visit instance corresponding to @st callchain
- * and add @backedge to visit->backedges. @st callchain must exist.
- */
-static int add_scc_backedge(struct bpf_verifier_env *env,
- struct bpf_verifier_state *st,
- struct bpf_scc_backedge *backedge)
-{
- struct bpf_scc_callchain *callchain = &env->callchain_buf;
- struct bpf_scc_visit *visit;
-
- if (!compute_scc_callchain(env, st, callchain)) {
- verifier_bug(env, "add backedge: no SCC in verification path, insn_idx %d",
- st->insn_idx);
- return -EFAULT;
- }
- visit = scc_visit_lookup(env, callchain);
- if (!visit) {
- verifier_bug(env, "add backedge: no visit info for call chain %s",
- format_callchain(env, callchain));
- return -EFAULT;
- }
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "SCC backedge %s\n", format_callchain(env, callchain));
- backedge->next = visit->backedges;
- visit->backedges = backedge;
- visit->num_backedges++;
- env->num_backedges++;
- update_peak_states(env);
- return 0;
-}
-
-/* bpf_reg_state->live marks for registers in a state @st are incomplete,
- * if state @st is in some SCC and not all execution paths starting at this
- * SCC are fully explored.
- */
-static bool incomplete_read_marks(struct bpf_verifier_env *env,
- struct bpf_verifier_state *st)
-{
- struct bpf_scc_callchain *callchain = &env->callchain_buf;
- struct bpf_scc_visit *visit;
- if (!compute_scc_callchain(env, st, callchain))
- return false;
- visit = scc_visit_lookup(env, callchain);
- if (!visit)
- return false;
- return !!visit->backedges;
-}
-
-static void free_backedges(struct bpf_scc_visit *visit)
+void bpf_free_backedges(struct bpf_scc_visit *visit)
{
struct bpf_scc_backedge *backedge, *next;
for (backedge = visit->backedges; backedge; backedge = next) {
- free_verifier_state(&backedge->state, false);
+ bpf_free_verifier_state(&backedge->state, false);
next = backedge->next;
kfree(backedge);
}
visit->backedges = NULL;
}
-static int update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
-{
- struct bpf_verifier_state_list *sl = NULL, *parent_sl;
- struct bpf_verifier_state *parent;
- int err;
-
- while (st) {
- u32 br = --st->branches;
-
- /* verifier_bug_if(br > 1, ...) technically makes sense here,
- * but see comment in push_stack(), hence:
- */
- verifier_bug_if((int)br < 0, env, "%s:branches_to_explore=%d", __func__, br);
- if (br)
- break;
- err = maybe_exit_scc(env, st);
- if (err)
- return err;
- parent = st->parent;
- parent_sl = state_parent_as_list(st);
- if (sl)
- maybe_free_verifier_state(env, sl);
- st = parent;
- sl = parent_sl;
- }
- return 0;
-}
-
static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
int *insn_idx, bool pop_log)
{
@@ -2089,7 +1679,7 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
return -ENOENT;
if (cur) {
- err = copy_verifier_state(cur, &head->st);
+ err = bpf_copy_verifier_state(cur, &head->st);
if (err)
return err;
}
@@ -2100,7 +1690,7 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
if (prev_insn_idx)
*prev_insn_idx = head->prev_insn_idx;
elem = head->next;
- free_verifier_state(&head->st, false);
+ bpf_free_verifier_state(&head->st, false);
kfree(head);
env->head = elem;
env->stack_size--;
@@ -2137,7 +1727,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
elem->log_pos = env->log.end_pos;
env->head = elem;
env->stack_size++;
- err = copy_verifier_state(&elem->st, cur);
+ err = bpf_copy_verifier_state(&elem->st, cur);
if (err)
return ERR_PTR(-ENOMEM);
elem->st.speculative |= speculative;
@@ -2161,7 +1751,6 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
return &elem->st;
}
-#define CALLER_SAVED_REGS 6
static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
@@ -2224,13 +1813,6 @@ static void __mark_reg_const_zero(const struct bpf_verifier_env *env, struct bpf
static void mark_reg_known_zero(struct bpf_verifier_env *env,
struct bpf_reg_state *regs, u32 regno)
{
- if (WARN_ON(regno >= MAX_BPF_REG)) {
- verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
- /* Something bad happened, let's kill all regs */
- for (regno = 0; regno < MAX_BPF_REG; regno++)
- __mark_reg_not_init(env, regs + regno);
- return;
- }
__mark_reg_known_zero(regs + regno);
}
@@ -2281,11 +1863,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
static void mark_reg_graph_node(struct bpf_reg_state *regs, u32 regno,
struct btf_field_graph_root *ds_head)
{
- __mark_reg_known_zero(&regs[regno]);
+ __mark_reg_known(&regs[regno], ds_head->node_offset);
regs[regno].type = PTR_TO_BTF_ID | MEM_ALLOC;
regs[regno].btf = ds_head->btf;
regs[regno].btf_id = ds_head->value_btf_id;
- regs[regno].off = ds_head->node_offset;
}
static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
@@ -2316,7 +1897,6 @@ static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
*/
return reg->type == which &&
reg->id == 0 &&
- reg->off == 0 &&
tnum_equals_const(reg->var_off, 0);
}
@@ -2379,6 +1959,9 @@ static void __update_reg32_bounds(struct bpf_reg_state *reg)
static void __update_reg64_bounds(struct bpf_reg_state *reg)
{
+ u64 tnum_next, tmax;
+ bool umin_in_tnum;
+
/* min signed is max(sign bit) | min(other bits) */
reg->smin_value = max_t(s64, reg->smin_value,
reg->var_off.value | (reg->var_off.mask & S64_MIN));
@@ -2388,6 +1971,33 @@ static void __update_reg64_bounds(struct bpf_reg_state *reg)
reg->umin_value = max(reg->umin_value, reg->var_off.value);
reg->umax_value = min(reg->umax_value,
reg->var_off.value | reg->var_off.mask);
+
+ /* Check if u64 and tnum overlap in a single value */
+ tnum_next = tnum_step(reg->var_off, reg->umin_value);
+ umin_in_tnum = (reg->umin_value & ~reg->var_off.mask) == reg->var_off.value;
+ tmax = reg->var_off.value | reg->var_off.mask;
+ if (umin_in_tnum && tnum_next > reg->umax_value) {
+ /* The u64 range and the tnum only overlap in umin.
+ * u64: ---[xxxxxx]-----
+ * tnum: --xx----------x-
+ */
+ ___mark_reg_known(reg, reg->umin_value);
+ } else if (!umin_in_tnum && tnum_next == tmax) {
+ /* The u64 range and the tnum only overlap in the maximum value
+ * represented by the tnum, called tmax.
+ * u64: ---[xxxxxx]-----
+ * tnum: xx-----x--------
+ */
+ ___mark_reg_known(reg, tmax);
+ } else if (!umin_in_tnum && tnum_next <= reg->umax_value &&
+ tnum_step(reg->var_off, tnum_next) > reg->umax_value) {
+ /* The u64 range and the tnum only overlap in between umin
+ * (excluded) and umax.
+ * u64: ---[xxxxxx]-----
+ * tnum: xx----x-------x-
+ */
+ ___mark_reg_known(reg, tnum_next);
+ }
}
static void __update_reg_bounds(struct bpf_reg_state *reg)
@@ -2397,7 +2007,7 @@ static void __update_reg_bounds(struct bpf_reg_state *reg)
}
/* Uses signed min/max values to inform unsigned, and vice-versa */
-static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
+static void deduce_bounds_32_from_64(struct bpf_reg_state *reg)
{
/* If upper 32 bits of u64/s64 range don't change, we can use lower 32
* bits to improve our u32/s32 boundaries.
@@ -2467,6 +2077,10 @@ static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
}
+}
+
+static void deduce_bounds_32_from_32(struct bpf_reg_state *reg)
+{
/* if u32 range forms a valid s32 range (due to matching sign bit),
* try to learn from that
*/
@@ -2481,10 +2095,34 @@ static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) {
reg->u32_min_value = max_t(u32, reg->s32_min_value, reg->u32_min_value);
reg->u32_max_value = min_t(u32, reg->s32_max_value, reg->u32_max_value);
+ } else {
+ if (reg->u32_max_value < (u32)reg->s32_min_value) {
+ /* See __reg64_deduce_bounds() for detailed explanation.
+ * Refine ranges in the following situation:
+ *
+ * 0 U32_MAX
+ * | [xxxxxxxxxxxxxx u32 range xxxxxxxxxxxxxx] |
+ * |----------------------------|----------------------------|
+ * |xxxxx s32 range xxxxxxxxx] [xxxxxxx|
+ * 0 S32_MAX S32_MIN -1
+ */
+ reg->s32_min_value = (s32)reg->u32_min_value;
+ reg->u32_max_value = min_t(u32, reg->u32_max_value, reg->s32_max_value);
+ } else if ((u32)reg->s32_max_value < reg->u32_min_value) {
+ /*
+ * 0 U32_MAX
+ * | [xxxxxxxxxxxxxx u32 range xxxxxxxxxxxxxx] |
+ * |----------------------------|----------------------------|
+ * |xxxxxxxxx] [xxxxxxxxxxxx s32 range |
+ * 0 S32_MAX S32_MIN -1
+ */
+ reg->s32_max_value = (s32)reg->u32_max_value;
+ reg->u32_min_value = max_t(u32, reg->u32_min_value, reg->s32_min_value);
+ }
}
}
-static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
+static void deduce_bounds_64_from_64(struct bpf_reg_state *reg)
{
/* If u64 range forms a valid s64 range (due to matching sign bit),
* try to learn from that. Let's do a bit of ASCII art to see when
@@ -2619,7 +2257,7 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
}
}
-static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
+static void deduce_bounds_64_from_32(struct bpf_reg_state *reg)
{
/* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit
* values on both sides of 64-bit range in hope to have tighter range.
@@ -2688,9 +2326,10 @@ static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
static void __reg_deduce_bounds(struct bpf_reg_state *reg)
{
- __reg32_deduce_bounds(reg);
- __reg64_deduce_bounds(reg);
- __reg_deduce_mixed_bounds(reg);
+ deduce_bounds_64_from_64(reg);
+ deduce_bounds_32_from_64(reg);
+ deduce_bounds_32_from_32(reg);
+ deduce_bounds_64_from_32(reg);
}
/* Attempts to improve var_off based on unsigned min/max information */
@@ -2706,14 +2345,18 @@ static void __reg_bound_offset(struct bpf_reg_state *reg)
reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
}
+static bool range_bounds_violation(struct bpf_reg_state *reg);
+
static void reg_bounds_sync(struct bpf_reg_state *reg)
{
+ /* If the input reg_state is invalid, we can exit early */
+ if (range_bounds_violation(reg))
+ return;
/* We might have learned new bounds from the var_off. */
__update_reg_bounds(reg);
/* We might have learned something about the sign bit. */
__reg_deduce_bounds(reg);
__reg_deduce_bounds(reg);
- __reg_deduce_bounds(reg);
/* We might have learned some bits from the bounds. */
__reg_bound_offset(reg);
/* Intersecting with the old var_off might have improved our bounds
@@ -2723,39 +2366,55 @@ static void reg_bounds_sync(struct bpf_reg_state *reg)
__update_reg_bounds(reg);
}
+static bool range_bounds_violation(struct bpf_reg_state *reg)
+{
+ return (reg->umin_value > reg->umax_value || reg->smin_value > reg->smax_value ||
+ reg->u32_min_value > reg->u32_max_value ||
+ reg->s32_min_value > reg->s32_max_value);
+}
+
+static bool const_tnum_range_mismatch(struct bpf_reg_state *reg)
+{
+ u64 uval = reg->var_off.value;
+ s64 sval = (s64)uval;
+
+ if (!tnum_is_const(reg->var_off))
+ return false;
+
+ return reg->umin_value != uval || reg->umax_value != uval ||
+ reg->smin_value != sval || reg->smax_value != sval;
+}
+
+static bool const_tnum_range_mismatch_32(struct bpf_reg_state *reg)
+{
+ u32 uval32 = tnum_subreg(reg->var_off).value;
+ s32 sval32 = (s32)uval32;
+
+ if (!tnum_subreg_is_const(reg->var_off))
+ return false;
+
+ return reg->u32_min_value != uval32 || reg->u32_max_value != uval32 ||
+ reg->s32_min_value != sval32 || reg->s32_max_value != sval32;
+}
+
static int reg_bounds_sanity_check(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, const char *ctx)
{
const char *msg;
- if (reg->umin_value > reg->umax_value ||
- reg->smin_value > reg->smax_value ||
- reg->u32_min_value > reg->u32_max_value ||
- reg->s32_min_value > reg->s32_max_value) {
- msg = "range bounds violation";
- goto out;
+ if (range_bounds_violation(reg)) {
+ msg = "range bounds violation";
+ goto out;
}
- if (tnum_is_const(reg->var_off)) {
- u64 uval = reg->var_off.value;
- s64 sval = (s64)uval;
-
- if (reg->umin_value != uval || reg->umax_value != uval ||
- reg->smin_value != sval || reg->smax_value != sval) {
- msg = "const tnum out of sync with range bounds";
- goto out;
- }
+ if (const_tnum_range_mismatch(reg)) {
+ msg = "const tnum out of sync with range bounds";
+ goto out;
}
- if (tnum_subreg_is_const(reg->var_off)) {
- u32 uval32 = tnum_subreg(reg->var_off).value;
- s32 sval32 = (s32)uval32;
-
- if (reg->u32_min_value != uval32 || reg->u32_max_value != uval32 ||
- reg->s32_min_value != sval32 || reg->s32_max_value != sval32) {
- msg = "const subreg tnum out of sync with range bounds";
- goto out;
- }
+ if (const_tnum_range_mismatch_32(reg)) {
+ msg = "const subreg tnum out of sync with range bounds";
+ goto out;
}
return 0;
@@ -2798,7 +2457,7 @@ static void __reg_assign_32_into_64(struct bpf_reg_state *reg)
}
/* Mark a register as having a completely unknown (scalar) value. */
-static void __mark_reg_unknown_imprecise(struct bpf_reg_state *reg)
+void bpf_mark_reg_unknown_imprecise(struct bpf_reg_state *reg)
{
/*
* Clear type, off, and union(map_ptr, range) and
@@ -2820,20 +2479,13 @@ static void __mark_reg_unknown_imprecise(struct bpf_reg_state *reg)
static void __mark_reg_unknown(const struct bpf_verifier_env *env,
struct bpf_reg_state *reg)
{
- __mark_reg_unknown_imprecise(reg);
+ bpf_mark_reg_unknown_imprecise(reg);
reg->precise = !env->bpf_capable;
}
static void mark_reg_unknown(struct bpf_verifier_env *env,
struct bpf_reg_state *regs, u32 regno)
{
- if (WARN_ON(regno >= MAX_BPF_REG)) {
- verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
- /* Something bad happened, let's kill all regs except FP */
- for (regno = 0; regno < BPF_REG_FP; regno++)
- __mark_reg_not_init(env, regs + regno);
- return;
- }
__mark_reg_unknown(env, regs + regno);
}
@@ -2856,26 +2508,13 @@ static int __mark_reg_s32_range(struct bpf_verifier_env *env,
return reg_bounds_sanity_check(env, reg, "s32_range");
}
-static void __mark_reg_not_init(const struct bpf_verifier_env *env,
- struct bpf_reg_state *reg)
+void bpf_mark_reg_not_init(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
{
__mark_reg_unknown(env, reg);
reg->type = NOT_INIT;
}
-static void mark_reg_not_init(struct bpf_verifier_env *env,
- struct bpf_reg_state *regs, u32 regno)
-{
- if (WARN_ON(regno >= MAX_BPF_REG)) {
- verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
- /* Something bad happened, let's kill all regs except FP */
- for (regno = 0; regno < BPF_REG_FP; regno++)
- __mark_reg_not_init(env, regs + regno);
- return;
- }
- __mark_reg_not_init(env, regs + regno);
-}
-
static int mark_btf_ld_reg(struct bpf_verifier_env *env,
struct bpf_reg_state *regs, u32 regno,
enum bpf_reg_type reg_type,
@@ -2913,7 +2552,7 @@ static void init_reg_state(struct bpf_verifier_env *env,
int i;
for (i = 0; i < MAX_BPF_REG; i++) {
- mark_reg_not_init(env, regs, i);
+ bpf_mark_reg_not_init(env, &regs[i]);
regs[i].subreg_def = DEF_NOT_SUBREG;
}
@@ -2925,10 +2564,13 @@ static void init_reg_state(struct bpf_verifier_env *env,
static struct bpf_retval_range retval_range(s32 minval, s32 maxval)
{
- return (struct bpf_retval_range){ minval, maxval };
+ /*
+ * return_32bit is set to false by default and set explicitly
+ * by the caller when necessary.
+ */
+ return (struct bpf_retval_range){ minval, maxval, false };
}
-#define BPF_MAIN_FUNC (-1)
static void init_func_state(struct bpf_verifier_env *env,
struct bpf_func_state *state,
int callsite, int frameno, int subprogno)
@@ -2965,7 +2607,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
env->stack_size);
return ERR_PTR(-E2BIG);
}
- /* Unlike push_stack() do not copy_verifier_state().
+ /* Unlike push_stack() do not bpf_copy_verifier_state().
* The caller state doesn't matter.
* This is async callback. It starts in a fresh stack.
* Initialize it similar to do_check_common().
@@ -2984,12 +2626,6 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
}
-enum reg_arg_type {
- SRC_OP, /* register is used as source operand */
- DST_OP, /* register is used as destination operand */
- DST_OP_NO_MARK /* same as above, check only, don't mark */
-};
-
static int cmp_subprogs(const void *a, const void *b)
{
return ((struct bpf_subprog_info *)a)->start -
@@ -3018,7 +2654,7 @@ struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *en
}
/* Find subprogram that starts exactly at 'off' */
-static int find_subprog(struct bpf_verifier_env *env, int off)
+int bpf_find_subprog(struct bpf_verifier_env *env, int off)
{
struct bpf_subprog_info *p;
@@ -3037,7 +2673,7 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
verbose(env, "call to invalid destination\n");
return -EINVAL;
}
- ret = find_subprog(env, off);
+ ret = bpf_find_subprog(env, off);
if (ret >= 0)
return ret;
if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
@@ -3113,41 +2749,19 @@ static int bpf_find_exception_callback_insn_off(struct bpf_verifier_env *env)
return ret;
}
-#define MAX_KFUNC_DESCS 256
#define MAX_KFUNC_BTFS 256
-struct bpf_kfunc_desc {
- struct btf_func_model func_model;
- u32 func_id;
- s32 imm;
- u16 offset;
- unsigned long addr;
-};
-
struct bpf_kfunc_btf {
struct btf *btf;
struct module *module;
u16 offset;
};
-struct bpf_kfunc_desc_tab {
- /* Sorted by func_id (BTF ID) and offset (fd_array offset) during
- * verification. JITs do lookups by bpf_insn, where func_id may not be
- * available, therefore at the end of verification do_misc_fixups()
- * sorts this by imm and offset.
- */
- struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS];
- u32 nr_descs;
-};
-
struct bpf_kfunc_btf_tab {
struct bpf_kfunc_btf descs[MAX_KFUNC_BTFS];
u32 nr_descs;
};
-static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc,
- int insn_idx);
-
static int kfunc_desc_cmp_by_id_off(const void *a, const void *b)
{
const struct bpf_kfunc_desc *d0 = a;
@@ -3375,7 +2989,7 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env,
return 0;
}
-static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
+int bpf_add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, u16 offset)
{
struct bpf_kfunc_btf_tab *btf_tab;
struct btf_func_model func_model;
@@ -3470,95 +3084,11 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
return 0;
}
-static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b)
-{
- const struct bpf_kfunc_desc *d0 = a;
- const struct bpf_kfunc_desc *d1 = b;
-
- if (d0->imm != d1->imm)
- return d0->imm < d1->imm ? -1 : 1;
- if (d0->offset != d1->offset)
- return d0->offset < d1->offset ? -1 : 1;
- return 0;
-}
-
-static int set_kfunc_desc_imm(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc)
-{
- unsigned long call_imm;
-
- if (bpf_jit_supports_far_kfunc_call()) {
- call_imm = desc->func_id;
- } else {
- call_imm = BPF_CALL_IMM(desc->addr);
- /* Check whether the relative offset overflows desc->imm */
- if ((unsigned long)(s32)call_imm != call_imm) {
- verbose(env, "address of kernel func_id %u is out of range\n",
- desc->func_id);
- return -EINVAL;
- }
- }
- desc->imm = call_imm;
- return 0;
-}
-
-static int sort_kfunc_descs_by_imm_off(struct bpf_verifier_env *env)
-{
- struct bpf_kfunc_desc_tab *tab;
- int i, err;
-
- tab = env->prog->aux->kfunc_tab;
- if (!tab)
- return 0;
-
- for (i = 0; i < tab->nr_descs; i++) {
- err = set_kfunc_desc_imm(env, &tab->descs[i]);
- if (err)
- return err;
- }
-
- sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
- kfunc_desc_cmp_by_imm_off, NULL);
- return 0;
-}
-
bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
{
return !!prog->aux->kfunc_tab;
}
-const struct btf_func_model *
-bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
- const struct bpf_insn *insn)
-{
- const struct bpf_kfunc_desc desc = {
- .imm = insn->imm,
- .offset = insn->off,
- };
- const struct bpf_kfunc_desc *res;
- struct bpf_kfunc_desc_tab *tab;
-
- tab = prog->aux->kfunc_tab;
- res = bsearch(&desc, tab->descs, tab->nr_descs,
- sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off);
-
- return res ? &res->func_model : NULL;
-}
-
-static int add_kfunc_in_insns(struct bpf_verifier_env *env,
- struct bpf_insn *insn, int cnt)
-{
- int i, ret;
-
- for (i = 0; i < cnt; i++, insn++) {
- if (bpf_pseudo_kfunc_call(insn)) {
- ret = add_kfunc_call(env, insn->imm, insn->off);
- if (ret < 0)
- return ret;
- }
- }
- return 0;
-}
-
static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
{
struct bpf_subprog_info *subprog = env->subprog_info;
@@ -3583,7 +3113,7 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
if (bpf_pseudo_func(insn) || bpf_pseudo_call(insn))
ret = add_subprog(env, i + insn->imm + 1);
else
- ret = add_kfunc_call(env, insn->imm, insn->off);
+ ret = bpf_add_kfunc_call(env, insn->imm, insn->off);
if (ret < 0)
return ret;
@@ -3605,7 +3135,7 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
if (env->subprog_info[i].start != ex_cb_insn)
continue;
env->exception_callback_subprog = i;
- mark_subprog_exc_cb(env, i);
+ bpf_mark_subprog_exc_cb(env, i);
break;
}
}
@@ -3678,17 +3208,101 @@ next:
return 0;
}
+/*
+ * Sort subprogs in topological order so that leaf subprogs come first and
+ * their callers come later. This is a DFS post-order traversal of the call
+ * graph. Scan only reachable instructions (those in the computed postorder) of
+ * the current subprog to discover callees (direct subprogs and sync
+ * callbacks).
+ */
+static int sort_subprogs_topo(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *si = env->subprog_info;
+ int *insn_postorder = env->cfg.insn_postorder;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int cnt = env->subprog_cnt;
+ int *dfs_stack = NULL;
+ int top = 0, order = 0;
+ int i, ret = 0;
+ u8 *color = NULL;
+
+ color = kvzalloc_objs(*color, cnt, GFP_KERNEL_ACCOUNT);
+ dfs_stack = kvmalloc_objs(*dfs_stack, cnt, GFP_KERNEL_ACCOUNT);
+ if (!color || !dfs_stack) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * DFS post-order traversal.
+ * Color values: 0 = unvisited, 1 = on stack, 2 = done.
+ */
+ for (i = 0; i < cnt; i++) {
+ if (color[i])
+ continue;
+ color[i] = 1;
+ dfs_stack[top++] = i;
+
+ while (top > 0) {
+ int cur = dfs_stack[top - 1];
+ int po_start = si[cur].postorder_start;
+ int po_end = si[cur + 1].postorder_start;
+ bool pushed = false;
+ int j;
+
+ for (j = po_start; j < po_end; j++) {
+ int idx = insn_postorder[j];
+ int callee;
+
+ if (!bpf_pseudo_call(&insn[idx]) && !bpf_pseudo_func(&insn[idx]))
+ continue;
+ callee = bpf_find_subprog(env, idx + insn[idx].imm + 1);
+ if (callee < 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+ if (color[callee] == 2)
+ continue;
+ if (color[callee] == 1) {
+ if (bpf_pseudo_func(&insn[idx]))
+ continue;
+ verbose(env, "recursive call from %s() to %s()\n",
+ subprog_name(env, cur),
+ subprog_name(env, callee));
+ ret = -EINVAL;
+ goto out;
+ }
+ color[callee] = 1;
+ dfs_stack[top++] = callee;
+ pushed = true;
+ break;
+ }
+
+ if (!pushed) {
+ color[cur] = 2;
+ env->subprog_topo_order[order++] = cur;
+ top--;
+ }
+ }
+ }
+
+ if (env->log.level & BPF_LOG_LEVEL2)
+ for (i = 0; i < cnt; i++)
+ verbose(env, "topo_order[%d] = %s\n",
+ i, subprog_name(env, env->subprog_topo_order[i]));
+out:
+ kvfree(dfs_stack);
+ kvfree(color);
+ return ret;
+}
+
static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
int spi, int nr_slots)
{
- int err, i;
+ int i;
- for (i = 0; i < nr_slots; i++) {
- err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi - i));
- if (err)
- return err;
+ for (i = 0; i < nr_slots; i++)
mark_stack_slot_scratched(env, spi - i);
- }
return 0;
}
@@ -3732,8 +3346,8 @@ static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state
* code only. It returns TRUE if the source or destination register operates
* on 64-bit, otherwise return FALSE.
*/
-static bool is_reg64(struct bpf_insn *insn,
- u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t)
+bool bpf_is_reg64(struct bpf_insn *insn,
+ u32 regno, struct bpf_reg_state *reg, enum bpf_reg_arg_type t)
{
u8 code, class, op;
@@ -3818,41 +3432,6 @@ static bool is_reg64(struct bpf_insn *insn,
return true;
}
-/* Return the regno defined by the insn, or -1. */
-static int insn_def_regno(const struct bpf_insn *insn)
-{
- switch (BPF_CLASS(insn->code)) {
- case BPF_JMP:
- case BPF_JMP32:
- case BPF_ST:
- return -1;
- case BPF_STX:
- if (BPF_MODE(insn->code) == BPF_ATOMIC ||
- BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) {
- if (insn->imm == BPF_CMPXCHG)
- return BPF_REG_0;
- else if (insn->imm == BPF_LOAD_ACQ)
- return insn->dst_reg;
- else if (insn->imm & BPF_FETCH)
- return insn->src_reg;
- }
- return -1;
- default:
- return insn->dst_reg;
- }
-}
-
-/* Return TRUE if INSN has defined any 32-bit value explicitly. */
-static bool insn_has_def32(struct bpf_insn *insn)
-{
- int dst_reg = insn_def_regno(insn);
-
- if (dst_reg == -1)
- return false;
-
- return !is_reg64(insn, dst_reg, NULL, DST_OP);
-}
-
static void mark_insn_zext(struct bpf_verifier_env *env,
struct bpf_reg_state *reg)
{
@@ -3867,21 +3446,16 @@ static void mark_insn_zext(struct bpf_verifier_env *env,
}
static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno,
- enum reg_arg_type t)
+ enum bpf_reg_arg_type t)
{
struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
struct bpf_reg_state *reg;
bool rw64;
- if (regno >= MAX_BPF_REG) {
- verbose(env, "R%d is invalid\n", regno);
- return -EINVAL;
- }
-
mark_reg_scratched(env, regno);
reg = &regs[regno];
- rw64 = is_reg64(insn, regno, reg, t);
+ rw64 = bpf_is_reg64(insn, regno, reg, t);
if (t == SRC_OP) {
/* check whether register used as source operand can be read */
if (reg->type == NOT_INIT) {
@@ -3910,7 +3484,7 @@ static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *r
}
static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
- enum reg_arg_type t)
+ enum bpf_reg_arg_type t)
{
struct bpf_verifier_state *vstate = env->cur_state;
struct bpf_func_state *state = vstate->frame[vstate->curframe];
@@ -3923,24 +3497,9 @@ static int insn_stack_access_flags(int frameno, int spi)
return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno;
}
-static int insn_stack_access_spi(int insn_flags)
-{
- return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK;
-}
-
-static int insn_stack_access_frameno(int insn_flags)
-{
- return insn_flags & INSN_F_FRAMENO_MASK;
-}
-
-static void mark_jmp_point(struct bpf_verifier_env *env, int idx)
-{
- env->insn_aux_data[idx].jmp_point = true;
-}
-
-static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx)
+static void mark_indirect_target(struct bpf_verifier_env *env, int idx)
{
- return env->insn_aux_data[insn_idx].jmp_point;
+ env->insn_aux_data[idx].indirect_target = true;
}
#define LR_FRAMENO_BITS 3
@@ -4021,91 +3580,6 @@ static void linked_regs_unpack(u64 val, struct linked_regs *s)
}
}
-/* for any branch, call, exit record the history of jmps in the given state */
-static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
- int insn_flags, u64 linked_regs)
-{
- u32 cnt = cur->jmp_history_cnt;
- struct bpf_jmp_history_entry *p;
- size_t alloc_size;
-
- /* combine instruction flags if we already recorded this instruction */
- if (env->cur_hist_ent) {
- /* atomic instructions push insn_flags twice, for READ and
- * WRITE sides, but they should agree on stack slot
- */
- verifier_bug_if((env->cur_hist_ent->flags & insn_flags) &&
- (env->cur_hist_ent->flags & insn_flags) != insn_flags,
- env, "insn history: insn_idx %d cur flags %x new flags %x",
- env->insn_idx, env->cur_hist_ent->flags, insn_flags);
- env->cur_hist_ent->flags |= insn_flags;
- verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env,
- "insn history: insn_idx %d linked_regs: %#llx",
- env->insn_idx, env->cur_hist_ent->linked_regs);
- env->cur_hist_ent->linked_regs = linked_regs;
- return 0;
- }
-
- cnt++;
- alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
- p = krealloc(cur->jmp_history, alloc_size, GFP_KERNEL_ACCOUNT);
- if (!p)
- return -ENOMEM;
- cur->jmp_history = p;
-
- p = &cur->jmp_history[cnt - 1];
- p->idx = env->insn_idx;
- p->prev_idx = env->prev_insn_idx;
- p->flags = insn_flags;
- p->linked_regs = linked_regs;
- cur->jmp_history_cnt = cnt;
- env->cur_hist_ent = p;
-
- return 0;
-}
-
-static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st,
- u32 hist_end, int insn_idx)
-{
- if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx)
- return &st->jmp_history[hist_end - 1];
- return NULL;
-}
-
-/* Backtrack one insn at a time. If idx is not at the top of recorded
- * history then previous instruction came from straight line execution.
- * Return -ENOENT if we exhausted all instructions within given state.
- *
- * It's legal to have a bit of a looping with the same starting and ending
- * insn index within the same state, e.g.: 3->4->5->3, so just because current
- * instruction index is the same as state's first_idx doesn't mean we are
- * done. If there is still some jump history left, we should keep going. We
- * need to take into account that we might have a jump history between given
- * state's parent and itself, due to checkpointing. In this case, we'll have
- * history entry recording a jump from last instruction of parent state and
- * first instruction of given state.
- */
-static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
- u32 *history)
-{
- u32 cnt = *history;
-
- if (i == st->first_insn_idx) {
- if (cnt == 0)
- return -ENOENT;
- if (cnt == 1 && st->jmp_history[0].idx == i)
- return -ENOENT;
- }
-
- if (cnt && st->jmp_history[cnt - 1].idx == i) {
- i = st->jmp_history[cnt - 1].prev_idx;
- (*history)--;
- } else {
- i--;
- }
- return i;
-}
-
static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
{
const struct btf_type *func;
@@ -4122,7 +3596,7 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
return btf_name_by_offset(desc_btf, func->name_off);
}
-static void verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn)
+void bpf_verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
const struct bpf_insn_cbs cbs = {
.cb_call = disasm_kfunc_name,
@@ -4133,158 +3607,10 @@ static void verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn)
print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
}
-static inline void bt_init(struct backtrack_state *bt, u32 frame)
-{
- bt->frame = frame;
-}
-
-static inline void bt_reset(struct backtrack_state *bt)
-{
- struct bpf_verifier_env *env = bt->env;
-
- memset(bt, 0, sizeof(*bt));
- bt->env = env;
-}
-
-static inline u32 bt_empty(struct backtrack_state *bt)
-{
- u64 mask = 0;
- int i;
-
- for (i = 0; i <= bt->frame; i++)
- mask |= bt->reg_masks[i] | bt->stack_masks[i];
-
- return mask == 0;
-}
-
-static inline int bt_subprog_enter(struct backtrack_state *bt)
-{
- if (bt->frame == MAX_CALL_FRAMES - 1) {
- verifier_bug(bt->env, "subprog enter from frame %d", bt->frame);
- return -EFAULT;
- }
- bt->frame++;
- return 0;
-}
-
-static inline int bt_subprog_exit(struct backtrack_state *bt)
-{
- if (bt->frame == 0) {
- verifier_bug(bt->env, "subprog exit from frame 0");
- return -EFAULT;
- }
- bt->frame--;
- return 0;
-}
-
-static inline void bt_set_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg)
-{
- bt->reg_masks[frame] |= 1 << reg;
-}
-
-static inline void bt_clear_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg)
-{
- bt->reg_masks[frame] &= ~(1 << reg);
-}
-
-static inline void bt_set_reg(struct backtrack_state *bt, u32 reg)
-{
- bt_set_frame_reg(bt, bt->frame, reg);
-}
-
-static inline void bt_clear_reg(struct backtrack_state *bt, u32 reg)
-{
- bt_clear_frame_reg(bt, bt->frame, reg);
-}
-
-static inline void bt_set_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot)
-{
- bt->stack_masks[frame] |= 1ull << slot;
-}
-
-static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot)
-{
- bt->stack_masks[frame] &= ~(1ull << slot);
-}
-
-static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame)
-{
- return bt->reg_masks[frame];
-}
-
-static inline u32 bt_reg_mask(struct backtrack_state *bt)
-{
- return bt->reg_masks[bt->frame];
-}
-
-static inline u64 bt_frame_stack_mask(struct backtrack_state *bt, u32 frame)
-{
- return bt->stack_masks[frame];
-}
-
-static inline u64 bt_stack_mask(struct backtrack_state *bt)
-{
- return bt->stack_masks[bt->frame];
-}
-
-static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg)
-{
- return bt->reg_masks[bt->frame] & (1 << reg);
-}
-
-static inline bool bt_is_frame_reg_set(struct backtrack_state *bt, u32 frame, u32 reg)
-{
- return bt->reg_masks[frame] & (1 << reg);
-}
-
-static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot)
-{
- return bt->stack_masks[frame] & (1ull << slot);
-}
-
-/* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */
-static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask)
-{
- DECLARE_BITMAP(mask, 64);
- bool first = true;
- int i, n;
-
- buf[0] = '\0';
-
- bitmap_from_u64(mask, reg_mask);
- for_each_set_bit(i, mask, 32) {
- n = snprintf(buf, buf_sz, "%sr%d", first ? "" : ",", i);
- first = false;
- buf += n;
- buf_sz -= n;
- if (buf_sz < 0)
- break;
- }
-}
-/* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */
-void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
-{
- DECLARE_BITMAP(mask, 64);
- bool first = true;
- int i, n;
-
- buf[0] = '\0';
-
- bitmap_from_u64(mask, stack_mask);
- for_each_set_bit(i, mask, 64) {
- n = snprintf(buf, buf_sz, "%s%d", first ? "" : ",", -(i + 1) * 8);
- first = false;
- buf += n;
- buf_sz -= n;
- if (buf_sz < 0)
- break;
- }
-}
-
/* If any register R in hist->linked_regs is marked as precise in bt,
* do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs.
*/
-static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist)
+void bpf_bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist)
{
struct linked_regs linked_regs;
bool some_precise = false;
@@ -4311,713 +3637,15 @@ static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_histo
struct linked_reg *e = &linked_regs.entries[i];
if (e->is_reg)
- bt_set_frame_reg(bt, e->frameno, e->regno);
+ bpf_bt_set_frame_reg(bt, e->frameno, e->regno);
else
- bt_set_frame_slot(bt, e->frameno, e->spi);
- }
-}
-
-/* For given verifier state backtrack_insn() is called from the last insn to
- * the first insn. Its purpose is to compute a bitmask of registers and
- * stack slots that needs precision in the parent verifier state.
- *
- * @idx is an index of the instruction we are currently processing;
- * @subseq_idx is an index of the subsequent instruction that:
- * - *would be* executed next, if jump history is viewed in forward order;
- * - *was* processed previously during backtracking.
- */
-static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
- struct bpf_jmp_history_entry *hist, struct backtrack_state *bt)
-{
- struct bpf_insn *insn = env->prog->insnsi + idx;
- u8 class = BPF_CLASS(insn->code);
- u8 opcode = BPF_OP(insn->code);
- u8 mode = BPF_MODE(insn->code);
- u32 dreg = insn->dst_reg;
- u32 sreg = insn->src_reg;
- u32 spi, i, fr;
-
- if (insn->code == 0)
- return 0;
- if (env->log.level & BPF_LOG_LEVEL2) {
- fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt));
- verbose(env, "mark_precise: frame%d: regs=%s ",
- bt->frame, env->tmp_str_buf);
- bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt));
- verbose(env, "stack=%s before ", env->tmp_str_buf);
- verbose(env, "%d: ", idx);
- verbose_insn(env, insn);
- }
-
- /* If there is a history record that some registers gained range at this insn,
- * propagate precision marks to those registers, so that bt_is_reg_set()
- * accounts for these registers.
- */
- bt_sync_linked_regs(bt, hist);
-
- if (class == BPF_ALU || class == BPF_ALU64) {
- if (!bt_is_reg_set(bt, dreg))
- return 0;
- if (opcode == BPF_END || opcode == BPF_NEG) {
- /* sreg is reserved and unused
- * dreg still need precision before this insn
- */
- return 0;
- } else if (opcode == BPF_MOV) {
- if (BPF_SRC(insn->code) == BPF_X) {
- /* dreg = sreg or dreg = (s8, s16, s32)sreg
- * dreg needs precision after this insn
- * sreg needs precision before this insn
- */
- bt_clear_reg(bt, dreg);
- if (sreg != BPF_REG_FP)
- bt_set_reg(bt, sreg);
- } else {
- /* dreg = K
- * dreg needs precision after this insn.
- * Corresponding register is already marked
- * as precise=true in this verifier state.
- * No further markings in parent are necessary
- */
- bt_clear_reg(bt, dreg);
- }
- } else {
- if (BPF_SRC(insn->code) == BPF_X) {
- /* dreg += sreg
- * both dreg and sreg need precision
- * before this insn
- */
- if (sreg != BPF_REG_FP)
- bt_set_reg(bt, sreg);
- } /* else dreg += K
- * dreg still needs precision before this insn
- */
- }
- } else if (class == BPF_LDX || is_atomic_load_insn(insn)) {
- if (!bt_is_reg_set(bt, dreg))
- return 0;
- bt_clear_reg(bt, dreg);
-
- /* scalars can only be spilled into stack w/o losing precision.
- * Load from any other memory can be zero extended.
- * The desire to keep that precision is already indicated
- * by 'precise' mark in corresponding register of this state.
- * No further tracking necessary.
- */
- if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
- return 0;
- /* dreg = *(u64 *)[fp - off] was a fill from the stack.
- * that [fp - off] slot contains scalar that needs to be
- * tracked with precision
- */
- spi = insn_stack_access_spi(hist->flags);
- fr = insn_stack_access_frameno(hist->flags);
- bt_set_frame_slot(bt, fr, spi);
- } else if (class == BPF_STX || class == BPF_ST) {
- if (bt_is_reg_set(bt, dreg))
- /* stx & st shouldn't be using _scalar_ dst_reg
- * to access memory. It means backtracking
- * encountered a case of pointer subtraction.
- */
- return -ENOTSUPP;
- /* scalars can only be spilled into stack */
- if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
- return 0;
- spi = insn_stack_access_spi(hist->flags);
- fr = insn_stack_access_frameno(hist->flags);
- if (!bt_is_frame_slot_set(bt, fr, spi))
- return 0;
- bt_clear_frame_slot(bt, fr, spi);
- if (class == BPF_STX)
- bt_set_reg(bt, sreg);
- } else if (class == BPF_JMP || class == BPF_JMP32) {
- if (bpf_pseudo_call(insn)) {
- int subprog_insn_idx, subprog;
-
- subprog_insn_idx = idx + insn->imm + 1;
- subprog = find_subprog(env, subprog_insn_idx);
- if (subprog < 0)
- return -EFAULT;
-
- if (subprog_is_global(env, subprog)) {
- /* check that jump history doesn't have any
- * extra instructions from subprog; the next
- * instruction after call to global subprog
- * should be literally next instruction in
- * caller program
- */
- verifier_bug_if(idx + 1 != subseq_idx, env,
- "extra insn from subprog");
- /* r1-r5 are invalidated after subprog call,
- * so for global func call it shouldn't be set
- * anymore
- */
- if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
- verifier_bug(env, "global subprog unexpected regs %x",
- bt_reg_mask(bt));
- return -EFAULT;
- }
- /* global subprog always sets R0 */
- bt_clear_reg(bt, BPF_REG_0);
- return 0;
- } else {
- /* static subprog call instruction, which
- * means that we are exiting current subprog,
- * so only r1-r5 could be still requested as
- * precise, r0 and r6-r10 or any stack slot in
- * the current frame should be zero by now
- */
- if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
- verifier_bug(env, "static subprog unexpected regs %x",
- bt_reg_mask(bt));
- return -EFAULT;
- }
- /* we are now tracking register spills correctly,
- * so any instance of leftover slots is a bug
- */
- if (bt_stack_mask(bt) != 0) {
- verifier_bug(env,
- "static subprog leftover stack slots %llx",
- bt_stack_mask(bt));
- return -EFAULT;
- }
- /* propagate r1-r5 to the caller */
- for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
- if (bt_is_reg_set(bt, i)) {
- bt_clear_reg(bt, i);
- bt_set_frame_reg(bt, bt->frame - 1, i);
- }
- }
- if (bt_subprog_exit(bt))
- return -EFAULT;
- return 0;
- }
- } else if (is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) {
- /* exit from callback subprog to callback-calling helper or
- * kfunc call. Use idx/subseq_idx check to discern it from
- * straight line code backtracking.
- * Unlike the subprog call handling above, we shouldn't
- * propagate precision of r1-r5 (if any requested), as they are
- * not actually arguments passed directly to callback subprogs
- */
- if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
- verifier_bug(env, "callback unexpected regs %x",
- bt_reg_mask(bt));
- return -EFAULT;
- }
- if (bt_stack_mask(bt) != 0) {
- verifier_bug(env, "callback leftover stack slots %llx",
- bt_stack_mask(bt));
- return -EFAULT;
- }
- /* clear r1-r5 in callback subprog's mask */
- for (i = BPF_REG_1; i <= BPF_REG_5; i++)
- bt_clear_reg(bt, i);
- if (bt_subprog_exit(bt))
- return -EFAULT;
- return 0;
- } else if (opcode == BPF_CALL) {
- /* kfunc with imm==0 is invalid and fixup_kfunc_call will
- * catch this error later. Make backtracking conservative
- * with ENOTSUPP.
- */
- if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0)
- return -ENOTSUPP;
- /* regular helper call sets R0 */
- bt_clear_reg(bt, BPF_REG_0);
- if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
- /* if backtracking was looking for registers R1-R5
- * they should have been found already.
- */
- verifier_bug(env, "backtracking call unexpected regs %x",
- bt_reg_mask(bt));
- return -EFAULT;
- }
- if (insn->src_reg == BPF_REG_0 && insn->imm == BPF_FUNC_tail_call
- && subseq_idx - idx != 1) {
- if (bt_subprog_enter(bt))
- return -EFAULT;
- }
- } else if (opcode == BPF_EXIT) {
- bool r0_precise;
-
- /* Backtracking to a nested function call, 'idx' is a part of
- * the inner frame 'subseq_idx' is a part of the outer frame.
- * In case of a regular function call, instructions giving
- * precision to registers R1-R5 should have been found already.
- * In case of a callback, it is ok to have R1-R5 marked for
- * backtracking, as these registers are set by the function
- * invoking callback.
- */
- if (subseq_idx >= 0 && bpf_calls_callback(env, subseq_idx))
- for (i = BPF_REG_1; i <= BPF_REG_5; i++)
- bt_clear_reg(bt, i);
- if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
- verifier_bug(env, "backtracking exit unexpected regs %x",
- bt_reg_mask(bt));
- return -EFAULT;
- }
-
- /* BPF_EXIT in subprog or callback always returns
- * right after the call instruction, so by checking
- * whether the instruction at subseq_idx-1 is subprog
- * call or not we can distinguish actual exit from
- * *subprog* from exit from *callback*. In the former
- * case, we need to propagate r0 precision, if
- * necessary. In the former we never do that.
- */
- r0_precise = subseq_idx - 1 >= 0 &&
- bpf_pseudo_call(&env->prog->insnsi[subseq_idx - 1]) &&
- bt_is_reg_set(bt, BPF_REG_0);
-
- bt_clear_reg(bt, BPF_REG_0);
- if (bt_subprog_enter(bt))
- return -EFAULT;
-
- if (r0_precise)
- bt_set_reg(bt, BPF_REG_0);
- /* r6-r9 and stack slots will stay set in caller frame
- * bitmasks until we return back from callee(s)
- */
- return 0;
- } else if (BPF_SRC(insn->code) == BPF_X) {
- if (!bt_is_reg_set(bt, dreg) && !bt_is_reg_set(bt, sreg))
- return 0;
- /* dreg <cond> sreg
- * Both dreg and sreg need precision before
- * this insn. If only sreg was marked precise
- * before it would be equally necessary to
- * propagate it to dreg.
- */
- if (!hist || !(hist->flags & INSN_F_SRC_REG_STACK))
- bt_set_reg(bt, sreg);
- if (!hist || !(hist->flags & INSN_F_DST_REG_STACK))
- bt_set_reg(bt, dreg);
- } else if (BPF_SRC(insn->code) == BPF_K) {
- /* dreg <cond> K
- * Only dreg still needs precision before
- * this insn, so for the K-based conditional
- * there is nothing new to be marked.
- */
- }
- } else if (class == BPF_LD) {
- if (!bt_is_reg_set(bt, dreg))
- return 0;
- bt_clear_reg(bt, dreg);
- /* It's ld_imm64 or ld_abs or ld_ind.
- * For ld_imm64 no further tracking of precision
- * into parent is necessary
- */
- if (mode == BPF_IND || mode == BPF_ABS)
- /* to be analyzed */
- return -ENOTSUPP;
- }
- /* Propagate precision marks to linked registers, to account for
- * registers marked as precise in this function.
- */
- bt_sync_linked_regs(bt, hist);
- return 0;
-}
-
-/* the scalar precision tracking algorithm:
- * . at the start all registers have precise=false.
- * . scalar ranges are tracked as normal through alu and jmp insns.
- * . once precise value of the scalar register is used in:
- * . ptr + scalar alu
- * . if (scalar cond K|scalar)
- * . helper_call(.., scalar, ...) where ARG_CONST is expected
- * backtrack through the verifier states and mark all registers and
- * stack slots with spilled constants that these scalar registers
- * should be precise.
- * . during state pruning two registers (or spilled stack slots)
- * are equivalent if both are not precise.
- *
- * Note the verifier cannot simply walk register parentage chain,
- * since many different registers and stack slots could have been
- * used to compute single precise scalar.
- *
- * The approach of starting with precise=true for all registers and then
- * backtrack to mark a register as not precise when the verifier detects
- * that program doesn't care about specific value (e.g., when helper
- * takes register as ARG_ANYTHING parameter) is not safe.
- *
- * It's ok to walk single parentage chain of the verifier states.
- * It's possible that this backtracking will go all the way till 1st insn.
- * All other branches will be explored for needing precision later.
- *
- * The backtracking needs to deal with cases like:
- * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0)
- * r9 -= r8
- * r5 = r9
- * if r5 > 0x79f goto pc+7
- * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff))
- * r5 += 1
- * ...
- * call bpf_perf_event_output#25
- * where .arg5_type = ARG_CONST_SIZE_OR_ZERO
- *
- * and this case:
- * r6 = 1
- * call foo // uses callee's r6 inside to compute r0
- * r0 += r6
- * if r0 == 0 goto
- *
- * to track above reg_mask/stack_mask needs to be independent for each frame.
- *
- * Also if parent's curframe > frame where backtracking started,
- * the verifier need to mark registers in both frames, otherwise callees
- * may incorrectly prune callers. This is similar to
- * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences")
- *
- * For now backtracking falls back into conservative marking.
- */
-static void mark_all_scalars_precise(struct bpf_verifier_env *env,
- struct bpf_verifier_state *st)
-{
- struct bpf_func_state *func;
- struct bpf_reg_state *reg;
- int i, j;
-
- if (env->log.level & BPF_LOG_LEVEL2) {
- verbose(env, "mark_precise: frame%d: falling back to forcing all scalars precise\n",
- st->curframe);
- }
-
- /* big hammer: mark all scalars precise in this path.
- * pop_stack may still get !precise scalars.
- * We also skip current state and go straight to first parent state,
- * because precision markings in current non-checkpointed state are
- * not needed. See why in the comment in __mark_chain_precision below.
- */
- for (st = st->parent; st; st = st->parent) {
- for (i = 0; i <= st->curframe; i++) {
- func = st->frame[i];
- for (j = 0; j < BPF_REG_FP; j++) {
- reg = &func->regs[j];
- if (reg->type != SCALAR_VALUE || reg->precise)
- continue;
- reg->precise = true;
- if (env->log.level & BPF_LOG_LEVEL2) {
- verbose(env, "force_precise: frame%d: forcing r%d to be precise\n",
- i, j);
- }
- }
- for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
- if (!is_spilled_reg(&func->stack[j]))
- continue;
- reg = &func->stack[j].spilled_ptr;
- if (reg->type != SCALAR_VALUE || reg->precise)
- continue;
- reg->precise = true;
- if (env->log.level & BPF_LOG_LEVEL2) {
- verbose(env, "force_precise: frame%d: forcing fp%d to be precise\n",
- i, -(j + 1) * 8);
- }
- }
- }
- }
-}
-
-static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
-{
- struct bpf_func_state *func;
- struct bpf_reg_state *reg;
- int i, j;
-
- for (i = 0; i <= st->curframe; i++) {
- func = st->frame[i];
- for (j = 0; j < BPF_REG_FP; j++) {
- reg = &func->regs[j];
- if (reg->type != SCALAR_VALUE)
- continue;
- reg->precise = false;
- }
- for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
- if (!is_spilled_reg(&func->stack[j]))
- continue;
- reg = &func->stack[j].spilled_ptr;
- if (reg->type != SCALAR_VALUE)
- continue;
- reg->precise = false;
- }
+ bpf_bt_set_frame_slot(bt, e->frameno, e->spi);
}
}
-/*
- * __mark_chain_precision() backtracks BPF program instruction sequence and
- * chain of verifier states making sure that register *regno* (if regno >= 0)
- * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked
- * SCALARS, as well as any other registers and slots that contribute to
- * a tracked state of given registers/stack slots, depending on specific BPF
- * assembly instructions (see backtrack_insns() for exact instruction handling
- * logic). This backtracking relies on recorded jmp_history and is able to
- * traverse entire chain of parent states. This process ends only when all the
- * necessary registers/slots and their transitive dependencies are marked as
- * precise.
- *
- * One important and subtle aspect is that precise marks *do not matter* in
- * the currently verified state (current state). It is important to understand
- * why this is the case.
- *
- * First, note that current state is the state that is not yet "checkpointed",
- * i.e., it is not yet put into env->explored_states, and it has no children
- * states as well. It's ephemeral, and can end up either a) being discarded if
- * compatible explored state is found at some point or BPF_EXIT instruction is
- * reached or b) checkpointed and put into env->explored_states, branching out
- * into one or more children states.
- *
- * In the former case, precise markings in current state are completely
- * ignored by state comparison code (see regsafe() for details). Only
- * checkpointed ("old") state precise markings are important, and if old
- * state's register/slot is precise, regsafe() assumes current state's
- * register/slot as precise and checks value ranges exactly and precisely. If
- * states turn out to be compatible, current state's necessary precise
- * markings and any required parent states' precise markings are enforced
- * after the fact with propagate_precision() logic, after the fact. But it's
- * important to realize that in this case, even after marking current state
- * registers/slots as precise, we immediately discard current state. So what
- * actually matters is any of the precise markings propagated into current
- * state's parent states, which are always checkpointed (due to b) case above).
- * As such, for scenario a) it doesn't matter if current state has precise
- * markings set or not.
- *
- * Now, for the scenario b), checkpointing and forking into child(ren)
- * state(s). Note that before current state gets to checkpointing step, any
- * processed instruction always assumes precise SCALAR register/slot
- * knowledge: if precise value or range is useful to prune jump branch, BPF
- * verifier takes this opportunity enthusiastically. Similarly, when
- * register's value is used to calculate offset or memory address, exact
- * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to
- * what we mentioned above about state comparison ignoring precise markings
- * during state comparison, BPF verifier ignores and also assumes precise
- * markings *at will* during instruction verification process. But as verifier
- * assumes precision, it also propagates any precision dependencies across
- * parent states, which are not yet finalized, so can be further restricted
- * based on new knowledge gained from restrictions enforced by their children
- * states. This is so that once those parent states are finalized, i.e., when
- * they have no more active children state, state comparison logic in
- * is_state_visited() would enforce strict and precise SCALAR ranges, if
- * required for correctness.
- *
- * To build a bit more intuition, note also that once a state is checkpointed,
- * the path we took to get to that state is not important. This is crucial
- * property for state pruning. When state is checkpointed and finalized at
- * some instruction index, it can be correctly and safely used to "short
- * circuit" any *compatible* state that reaches exactly the same instruction
- * index. I.e., if we jumped to that instruction from a completely different
- * code path than original finalized state was derived from, it doesn't
- * matter, current state can be discarded because from that instruction
- * forward having a compatible state will ensure we will safely reach the
- * exit. States describe preconditions for further exploration, but completely
- * forget the history of how we got here.
- *
- * This also means that even if we needed precise SCALAR range to get to
- * finalized state, but from that point forward *that same* SCALAR register is
- * never used in a precise context (i.e., it's precise value is not needed for
- * correctness), it's correct and safe to mark such register as "imprecise"
- * (i.e., precise marking set to false). This is what we rely on when we do
- * not set precise marking in current state. If no child state requires
- * precision for any given SCALAR register, it's safe to dictate that it can
- * be imprecise. If any child state does require this register to be precise,
- * we'll mark it precise later retroactively during precise markings
- * propagation from child state to parent states.
- *
- * Skipping precise marking setting in current state is a mild version of
- * relying on the above observation. But we can utilize this property even
- * more aggressively by proactively forgetting any precise marking in the
- * current state (which we inherited from the parent state), right before we
- * checkpoint it and branch off into new child state. This is done by
- * mark_all_scalars_imprecise() to hopefully get more permissive and generic
- * finalized states which help in short circuiting more future states.
- */
-static int __mark_chain_precision(struct bpf_verifier_env *env,
- struct bpf_verifier_state *starting_state,
- int regno,
- bool *changed)
-{
- struct bpf_verifier_state *st = starting_state;
- struct backtrack_state *bt = &env->bt;
- int first_idx = st->first_insn_idx;
- int last_idx = starting_state->insn_idx;
- int subseq_idx = -1;
- struct bpf_func_state *func;
- bool tmp, skip_first = true;
- struct bpf_reg_state *reg;
- int i, fr, err;
-
- if (!env->bpf_capable)
- return 0;
-
- changed = changed ?: &tmp;
- /* set frame number from which we are starting to backtrack */
- bt_init(bt, starting_state->curframe);
-
- /* Do sanity checks against current state of register and/or stack
- * slot, but don't set precise flag in current state, as precision
- * tracking in the current state is unnecessary.
- */
- func = st->frame[bt->frame];
- if (regno >= 0) {
- reg = &func->regs[regno];
- if (reg->type != SCALAR_VALUE) {
- verifier_bug(env, "backtracking misuse");
- return -EFAULT;
- }
- bt_set_reg(bt, regno);
- }
-
- if (bt_empty(bt))
- return 0;
-
- for (;;) {
- DECLARE_BITMAP(mask, 64);
- u32 history = st->jmp_history_cnt;
- struct bpf_jmp_history_entry *hist;
-
- if (env->log.level & BPF_LOG_LEVEL2) {
- verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
- bt->frame, last_idx, first_idx, subseq_idx);
- }
-
- if (last_idx < 0) {
- /* we are at the entry into subprog, which
- * is expected for global funcs, but only if
- * requested precise registers are R1-R5
- * (which are global func's input arguments)
- */
- if (st->curframe == 0 &&
- st->frame[0]->subprogno > 0 &&
- st->frame[0]->callsite == BPF_MAIN_FUNC &&
- bt_stack_mask(bt) == 0 &&
- (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) == 0) {
- bitmap_from_u64(mask, bt_reg_mask(bt));
- for_each_set_bit(i, mask, 32) {
- reg = &st->frame[0]->regs[i];
- bt_clear_reg(bt, i);
- if (reg->type == SCALAR_VALUE) {
- reg->precise = true;
- *changed = true;
- }
- }
- return 0;
- }
-
- verifier_bug(env, "backtracking func entry subprog %d reg_mask %x stack_mask %llx",
- st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt));
- return -EFAULT;
- }
-
- for (i = last_idx;;) {
- if (skip_first) {
- err = 0;
- skip_first = false;
- } else {
- hist = get_jmp_hist_entry(st, history, i);
- err = backtrack_insn(env, i, subseq_idx, hist, bt);
- }
- if (err == -ENOTSUPP) {
- mark_all_scalars_precise(env, starting_state);
- bt_reset(bt);
- return 0;
- } else if (err) {
- return err;
- }
- if (bt_empty(bt))
- /* Found assignment(s) into tracked register in this state.
- * Since this state is already marked, just return.
- * Nothing to be tracked further in the parent state.
- */
- return 0;
- subseq_idx = i;
- i = get_prev_insn_idx(st, i, &history);
- if (i == -ENOENT)
- break;
- if (i >= env->prog->len) {
- /* This can happen if backtracking reached insn 0
- * and there are still reg_mask or stack_mask
- * to backtrack.
- * It means the backtracking missed the spot where
- * particular register was initialized with a constant.
- */
- verifier_bug(env, "backtracking idx %d", i);
- return -EFAULT;
- }
- }
- st = st->parent;
- if (!st)
- break;
-
- for (fr = bt->frame; fr >= 0; fr--) {
- func = st->frame[fr];
- bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr));
- for_each_set_bit(i, mask, 32) {
- reg = &func->regs[i];
- if (reg->type != SCALAR_VALUE) {
- bt_clear_frame_reg(bt, fr, i);
- continue;
- }
- if (reg->precise) {
- bt_clear_frame_reg(bt, fr, i);
- } else {
- reg->precise = true;
- *changed = true;
- }
- }
-
- bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
- for_each_set_bit(i, mask, 64) {
- if (verifier_bug_if(i >= func->allocated_stack / BPF_REG_SIZE,
- env, "stack slot %d, total slots %d",
- i, func->allocated_stack / BPF_REG_SIZE))
- return -EFAULT;
-
- if (!is_spilled_scalar_reg(&func->stack[i])) {
- bt_clear_frame_slot(bt, fr, i);
- continue;
- }
- reg = &func->stack[i].spilled_ptr;
- if (reg->precise) {
- bt_clear_frame_slot(bt, fr, i);
- } else {
- reg->precise = true;
- *changed = true;
- }
- }
- if (env->log.level & BPF_LOG_LEVEL2) {
- fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
- bt_frame_reg_mask(bt, fr));
- verbose(env, "mark_precise: frame%d: parent state regs=%s ",
- fr, env->tmp_str_buf);
- bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
- bt_frame_stack_mask(bt, fr));
- verbose(env, "stack=%s: ", env->tmp_str_buf);
- print_verifier_state(env, st, fr, true);
- }
- }
-
- if (bt_empty(bt))
- return 0;
-
- subseq_idx = first_idx;
- last_idx = st->last_insn_idx;
- first_idx = st->first_insn_idx;
- }
-
- /* if we still have requested precise regs or slots, we missed
- * something (e.g., stack access through non-r10 register), so
- * fallback to marking all precise
- */
- if (!bt_empty(bt)) {
- mark_all_scalars_precise(env, starting_state);
- bt_reset(bt);
- }
-
- return 0;
-}
-
int mark_chain_precision(struct bpf_verifier_env *env, int regno)
{
- return __mark_chain_precision(env, env->cur_state, regno, NULL);
+ return bpf_mark_chain_precision(env, env->cur_state, regno, NULL);
}
/* mark_chain_precision_batch() assumes that env->bt is set in the caller to
@@ -5026,7 +3654,7 @@ int mark_chain_precision(struct bpf_verifier_env *env, int regno)
static int mark_chain_precision_batch(struct bpf_verifier_env *env,
struct bpf_verifier_state *starting_state)
{
- return __mark_chain_precision(env, starting_state, -1, NULL);
+ return bpf_mark_chain_precision(env, starting_state, -1, NULL);
}
static bool is_spillable_regtype(enum bpf_reg_type type)
@@ -5056,11 +3684,6 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
}
}
-/* Does this register contain a constant zero? */
-static bool register_is_null(struct bpf_reg_state *reg)
-{
- return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
-}
/* check if register is a constant scalar value */
static bool is_reg_const(struct bpf_reg_state *reg, bool subreg32)
@@ -5084,27 +3707,30 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
return reg->type != SCALAR_VALUE;
}
+static void clear_scalar_id(struct bpf_reg_state *reg)
+{
+ reg->id = 0;
+ reg->delta = 0;
+}
+
static void assign_scalar_id_before_mov(struct bpf_verifier_env *env,
struct bpf_reg_state *src_reg)
{
if (src_reg->type != SCALAR_VALUE)
return;
-
- if (src_reg->id & BPF_ADD_CONST) {
- /*
- * The verifier is processing rX = rY insn and
- * rY->id has special linked register already.
- * Cleared it, since multiple rX += const are not supported.
- */
- src_reg->id = 0;
- src_reg->off = 0;
- }
-
+ /*
+ * The verifier is processing rX = rY insn and
+ * rY->id has special linked register already.
+ * Cleared it, since multiple rX += const are not supported.
+ */
+ if (src_reg->id & BPF_ADD_CONST)
+ clear_scalar_id(src_reg);
+ /*
+ * Ensure that src_reg has a valid ID that will be copied to
+ * dst_reg and then will be used by sync_linked_regs() to
+ * propagate min/max range.
+ */
if (!src_reg->id && !tnum_is_const(src_reg->var_off))
- /* Ensure that src_reg has a valid ID that will be copied to
- * dst_reg and then will be used by sync_linked_regs() to
- * propagate min/max range.
- */
src_reg->id = ++env->id_gen;
}
@@ -5166,6 +3792,18 @@ static void check_fastcall_stack_contract(struct bpf_verifier_env *env,
}
}
+static void scrub_special_slot(struct bpf_func_state *state, int spi)
+{
+ int i;
+
+ /* regular write of data into stack destroys any spilled ptr */
+ state->stack[spi].spilled_ptr.type = NOT_INIT;
+ /* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */
+ if (is_stack_slot_special(&state->stack[spi]))
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ scrub_spilled_slot(&state->stack[spi].slot_type[i]);
+}
+
/* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
@@ -5185,8 +3823,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
* so it's aligned access and [off, off + size) are within stack limits
*/
if (!env->allow_ptr_leaks &&
- is_spilled_reg(&state->stack[spi]) &&
- !is_spilled_scalar_reg(&state->stack[spi]) &&
+ bpf_is_spilled_reg(&state->stack[spi]) &&
+ !bpf_is_spilled_scalar_reg(&state->stack[spi]) &&
size != BPF_REG_SIZE) {
verbose(env, "attempt to corrupt spilled pointer on stack\n");
return -EACCES;
@@ -5215,18 +3853,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
if (err)
return err;
- if (!(off % BPF_REG_SIZE) && size == BPF_REG_SIZE) {
- /* only mark the slot as written if all 8 bytes were written
- * otherwise read propagation may incorrectly stop too soon
- * when stack slots are partially written.
- * This heuristic means that read propagation will be
- * conservative, since it will add reg_live_read marks
- * to stack slots all the way to first state when programs
- * writes+reads less than 8 bytes
- */
- bpf_mark_stack_write(env, state->frameno, BIT(spi));
- }
-
check_fastcall_stack_contract(env, state, insn_idx, off);
mark_stack_slot_scratched(env, spi);
if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) {
@@ -5263,15 +3889,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
} else {
u8 type = STACK_MISC;
- /* regular write of data into stack destroys any spilled ptr */
- state->stack[spi].spilled_ptr.type = NOT_INIT;
- /* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */
- if (is_stack_slot_special(&state->stack[spi]))
- for (i = 0; i < BPF_REG_SIZE; i++)
- scrub_spilled_slot(&state->stack[spi].slot_type[i]);
+ scrub_special_slot(state, spi);
/* when we zero initialize stack slots mark them as such */
- if ((reg && register_is_null(reg)) ||
+ if ((reg && bpf_register_is_null(reg)) ||
(!reg && is_bpf_st_mem(insn) && insn->imm == 0)) {
/* STACK_ZERO case happened because register spill
* wasn't properly aligned at the stack slot boundary,
@@ -5292,7 +3913,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
}
if (insn_flags)
- return push_jmp_history(env, env->cur_state, insn_flags, 0);
+ return bpf_push_jmp_history(env, env->cur_state, insn_flags, 0);
return 0;
}
@@ -5302,7 +3923,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
* tracks the effects of the write, considering that each stack slot in the
* dynamic range is potentially written to.
*
- * 'off' includes 'regno->off'.
* 'value_regno' can be -1, meaning that an unknown value is being written to
* the stack.
*
@@ -5338,14 +3958,14 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
max_off = ptr_reg->smax_value + off + size;
if (value_regno >= 0)
value_reg = &cur->regs[value_regno];
- if ((value_reg && register_is_null(value_reg)) ||
+ if ((value_reg && bpf_register_is_null(value_reg)) ||
(!value_reg && is_bpf_st_mem(insn) && insn->imm == 0))
writing_zero = true;
for (i = min_off; i < max_off; i++) {
int spi;
- spi = __get_spi(i);
+ spi = bpf_get_spi(i);
err = destroy_if_dynptr_stack_slot(env, state, spi);
if (err)
return err;
@@ -5383,7 +4003,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
* maintain the spill type.
*/
if (writing_zero && *stype == STACK_SPILL &&
- is_spilled_scalar_reg(&state->stack[spi])) {
+ bpf_is_spilled_scalar_reg(&state->stack[spi])) {
struct bpf_reg_state *spill_reg = &state->stack[spi].spilled_ptr;
if (tnum_is_const(spill_reg->var_off) && spill_reg->var_off.value == 0) {
@@ -5392,8 +4012,13 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
}
}
- /* Erase all other spilled pointers. */
- state->stack[spi].spilled_ptr.type = NOT_INIT;
+ /*
+ * Scrub slots if variable-offset stack write goes over spilled pointers.
+ * Otherwise bpf_is_spilled_reg() may == true && spilled_ptr.type == NOT_INIT
+ * and valid program is rejected by check_stack_read_fixed_off()
+ * with obscure "invalid size of register fill" message.
+ */
+ scrub_special_slot(state, spi);
/* Update the slot type. */
new_type = STACK_MISC;
@@ -5408,8 +4033,10 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
* For privileged programs, we will accept such reads to slots
* that may or may not be written because, if we're reject
* them, the error would be too confusing.
+ * Conservatively, treat STACK_POISON in a similar way.
*/
- if (*stype == STACK_INVALID && !env->allow_uninit_stack) {
+ if ((*stype == STACK_INVALID || *stype == STACK_POISON) &&
+ !env->allow_uninit_stack) {
verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d",
insn_idx, i);
return -EINVAL;
@@ -5484,18 +4111,14 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
struct bpf_reg_state *reg;
u8 *stype, type;
int insn_flags = insn_stack_access_flags(reg_state->frameno, spi);
- int err;
stype = reg_state->stack[spi].slot_type;
reg = &reg_state->stack[spi].spilled_ptr;
mark_stack_slot_scratched(env, spi);
check_fastcall_stack_contract(env, state, env->insn_idx, off);
- err = bpf_mark_stack_read(env, reg_state->frameno, env->insn_idx, BIT(spi));
- if (err)
- return err;
- if (is_spilled_reg(&reg_state->stack[spi])) {
+ if (bpf_is_spilled_reg(&reg_state->stack[spi])) {
u8 spill_size = 1;
for (i = BPF_REG_SIZE - 1; i > 0 && stype[i - 1] == STACK_SPILL; i--)
@@ -5531,7 +4154,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
* coerce_reg_to_size will adjust the boundaries.
*/
if (get_reg_width(reg) > size * BITS_PER_BYTE)
- state->regs[dst_regno].id = 0;
+ clear_scalar_id(&state->regs[dst_regno]);
} else {
int spill_cnt = 0, zero_cnt = 0;
@@ -5549,8 +4172,13 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
}
if (type == STACK_INVALID && env->allow_uninit_stack)
continue;
- verbose(env, "invalid read from stack off %d+%d size %d\n",
- off, i, size);
+ if (type == STACK_POISON) {
+ verbose(env, "reading from stack off %d+%d size %d, slot poisoned by dead code elimination\n",
+ off, i, size);
+ } else {
+ verbose(env, "invalid read from stack off %d+%d size %d\n",
+ off, i, size);
+ }
return -EACCES;
}
@@ -5599,8 +4227,13 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
continue;
if (type == STACK_INVALID && env->allow_uninit_stack)
continue;
- verbose(env, "invalid read from stack off %d+%d size %d\n",
- off, i, size);
+ if (type == STACK_POISON) {
+ verbose(env, "reading from stack off %d+%d size %d, slot poisoned by dead code elimination\n",
+ off, i, size);
+ } else {
+ verbose(env, "invalid read from stack off %d+%d size %d\n",
+ off, i, size);
+ }
return -EACCES;
}
if (dst_regno >= 0)
@@ -5608,7 +4241,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
insn_flags = 0; /* we are not restoring spilled register */
}
if (insn_flags)
- return push_jmp_history(env, env->cur_state, insn_flags, 0);
+ return bpf_push_jmp_history(env, env->cur_state, insn_flags, 0);
return 0;
}
@@ -5646,7 +4279,7 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env,
{
/* The state of the source register. */
struct bpf_reg_state *reg = reg_state(env, ptr_regno);
- struct bpf_func_state *ptr_state = func(env, reg);
+ struct bpf_func_state *ptr_state = bpf_func(env, reg);
int err;
int min_off, max_off;
@@ -5678,7 +4311,7 @@ static int check_stack_read(struct bpf_verifier_env *env,
int dst_regno)
{
struct bpf_reg_state *reg = reg_state(env, ptr_regno);
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int err;
/* Some accesses are only permitted with a static offset. */
bool var_off = !tnum_is_const(reg->var_off);
@@ -5724,7 +4357,6 @@ static int check_stack_read(struct bpf_verifier_env *env,
* check_stack_write_var_off.
*
* 'ptr_regno' is the register used as a pointer into the stack.
- * 'off' includes 'ptr_regno->off', but not its variable offset (if any).
* 'value_regno' is the register whose value we're writing to the stack. It can
* be -1, meaning that we're not writing from a register.
*
@@ -5735,7 +4367,7 @@ static int check_stack_write(struct bpf_verifier_env *env,
int value_regno, int insn_idx)
{
struct bpf_reg_state *reg = reg_state(env, ptr_regno);
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int err;
if (tnum_is_const(reg->var_off)) {
@@ -5761,14 +4393,14 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
u32 cap = bpf_map_flags_to_cap(map);
if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) {
- verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n",
- map->value_size, off, size);
+ verbose(env, "write into map forbidden, value_size=%d off=%lld size=%d\n",
+ map->value_size, reg->smin_value + off, size);
return -EACCES;
}
if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) {
- verbose(env, "read from map forbidden, value_size=%d off=%d size=%d\n",
- map->value_size, off, size);
+ verbose(env, "read from map forbidden, value_size=%d off=%lld size=%d\n",
+ map->value_size, reg->smin_value + off, size);
return -EACCES;
}
@@ -5802,6 +4434,10 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno,
verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
off, size, regno, reg->id, off, mem_size);
break;
+ case PTR_TO_CTX:
+ verbose(env, "invalid access to context, ctx_size=%d off=%d size=%d\n",
+ mem_size, off, size);
+ break;
case PTR_TO_MEM:
default:
verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n",
@@ -5875,24 +4511,24 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env,
* is only allowed in its original, unmodified form.
*/
- if (reg->off < 0) {
- verbose(env, "negative offset %s ptr R%d off=%d disallowed\n",
- reg_type_str(env, reg->type), regno, reg->off);
+ if (!tnum_is_const(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "variable %s access var_off=%s disallowed\n",
+ reg_type_str(env, reg->type), tn_buf);
return -EACCES;
}
- if (!fixed_off_ok && reg->off) {
- verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n",
- reg_type_str(env, reg->type), regno, reg->off);
+ if (reg->smin_value < 0) {
+ verbose(env, "negative offset %s ptr R%d off=%lld disallowed\n",
+ reg_type_str(env, reg->type), regno, reg->var_off.value);
return -EACCES;
}
- if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
- char tn_buf[48];
-
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "variable %s access var_off=%s disallowed\n",
- reg_type_str(env, reg->type), tn_buf);
+ if (!fixed_off_ok && reg->var_off.value != 0) {
+ verbose(env, "dereference of modified %s ptr R%d off=%lld disallowed\n",
+ reg_type_str(env, reg->type), regno, reg->var_off.value);
return -EACCES;
}
@@ -5913,6 +4549,9 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
int perm_flags;
const char *reg_name = "";
+ if (base_type(reg->type) != PTR_TO_BTF_ID)
+ goto bad_type;
+
if (btf_is_kernel(reg->btf)) {
perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
@@ -5925,7 +4564,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
perm_flags |= MEM_PERCPU;
}
- if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
+ if (type_flag(reg->type) & ~perm_flags)
goto bad_type;
/* We need to verify reg->type and reg->btf, before accessing reg->btf */
@@ -5934,14 +4573,14 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
/* For ref_ptr case, release function check should ensure we get one
* referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the
* normal store of unreferenced kptr, we must ensure var_off is zero.
- * Since ref_ptr cannot be accessed directly by BPF insns, checks for
- * reg->off and reg->ref_obj_id are not needed here.
+ * Since ref_ptr cannot be accessed directly by BPF insns, check for
+ * reg->ref_obj_id is not needed here.
*/
if (__check_ptr_off_reg(env, reg, regno, true))
return -EACCES;
/* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and
- * we also need to take into account the reg->off.
+ * we also need to take into account the reg->var_off.
*
* We want to support cases like:
*
@@ -5952,19 +4591,19 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
*
* struct foo *v;
* v = func(); // PTR_TO_BTF_ID
- * val->foo = v; // reg->off is zero, btf and btf_id match type
- * val->bar = &v->br; // reg->off is still zero, but we need to retry with
+ * val->foo = v; // reg->var_off is zero, btf and btf_id match type
+ * val->bar = &v->br; // reg->var_off is still zero, but we need to retry with
* // first member type of struct after comparison fails
- * val->baz = &v->bz; // reg->off is non-zero, so struct needs to be walked
+ * val->baz = &v->bz; // reg->var_off is non-zero, so struct needs to be walked
* // to match type
*
- * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off
+ * In the kptr_ref case, check_func_arg_reg_off already ensures reg->var_off
* is zero. We must also ensure that btf_struct_ids_match does not walk
* the struct to match type against first member of struct, i.e. reject
* second case from above. Hence, when type is BPF_KPTR_REF, we set
* strict mode to true for type match.
*/
- if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->var_off.value,
kptr_field->kptr.btf, kptr_field->kptr.btf_id,
kptr_field->type != BPF_KPTR_UNREF))
goto bad_type;
@@ -6128,7 +4767,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
return ret;
} else if (class == BPF_STX) {
val_reg = reg_state(env, value_regno);
- if (!register_is_null(val_reg) &&
+ if (!bpf_register_is_null(val_reg) &&
map_kptr_match_type(env, kptr_field, val_reg, value_regno))
return -EACCES;
} else if (class == BPF_ST) {
@@ -6223,11 +4862,9 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
return 0;
}
-#define MAX_PACKET_OFF 0xffff
-
static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
- const struct bpf_call_arg_meta *meta,
- enum bpf_access_type t)
+ const struct bpf_call_arg_meta *meta,
+ enum bpf_access_type t)
{
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
@@ -6273,27 +4910,14 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
struct bpf_reg_state *reg = reg_state(env, regno);
int err;
- /* We may have added a variable offset to the packet pointer; but any
- * reg->range we have comes after that. We are only checking the fixed
- * offset.
- */
-
- /* We don't allow negative numbers, because we aren't tracking enough
- * detail to prove they're safe.
- */
- if (reg->smin_value < 0) {
- verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
- regno);
- return -EACCES;
+ if (reg->range < 0) {
+ verbose(env, "R%d offset is outside of the packet\n", regno);
+ return -EINVAL;
}
- err = reg->range < 0 ? -EINVAL :
- __check_mem_access(env, regno, off, size, reg->range,
- zero_size_allowed);
- if (err) {
- verbose(env, "R%d offset is outside of the packet\n", regno);
+ err = check_mem_region_access(env, regno, off, size, reg->range, zero_size_allowed);
+ if (err)
return err;
- }
/* __check_mem_access has made sure "off + size - 1" is within u16.
* reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff,
@@ -6305,12 +4929,17 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
max_t(u32, env->prog->aux->max_pkt_offset,
off + reg->umax_value + size - 1);
- return err;
+ return 0;
+}
+
+static bool is_var_ctx_off_allowed(struct bpf_prog *prog)
+{
+ return resolve_prog_type(prog) == BPF_PROG_TYPE_SYSCALL;
}
/* check access to 'struct bpf_context' fields. Supports fixed offsets only */
-static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
- enum bpf_access_type t, struct bpf_insn_access_aux *info)
+static int __check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
+ enum bpf_access_type t, struct bpf_insn_access_aux *info)
{
if (env->ops->is_valid_access &&
env->ops->is_valid_access(off, size, t, env->prog, info)) {
@@ -6341,6 +4970,34 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
return -EACCES;
}
+static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, u32 regno,
+ int off, int access_size, enum bpf_access_type t,
+ struct bpf_insn_access_aux *info)
+{
+ /*
+ * Program types that don't rewrite ctx accesses can safely
+ * dereference ctx pointers with fixed offsets.
+ */
+ bool var_off_ok = is_var_ctx_off_allowed(env->prog);
+ bool fixed_off_ok = !env->ops->convert_ctx_access;
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = regs + regno;
+ int err;
+
+ if (var_off_ok)
+ err = check_mem_region_access(env, regno, off, access_size, U16_MAX, false);
+ else
+ err = __check_ptr_off_reg(env, reg, regno, fixed_off_ok);
+ if (err)
+ return err;
+ off += reg->umax_value;
+
+ err = __check_ctx_access(env, insn_idx, off, access_size, t, info);
+ if (err)
+ verbose_linfo(env, insn_idx, "; ");
+ return err;
+}
+
static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
int size)
{
@@ -6522,14 +5179,14 @@ static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
*/
ip_align = 2;
- reg_off = tnum_add(reg->var_off, tnum_const(ip_align + reg->off + off));
+ reg_off = tnum_add(reg->var_off, tnum_const(ip_align + off));
if (!tnum_is_aligned(reg_off, size)) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
verbose(env,
- "misaligned packet access off %d+%s+%d+%d size %d\n",
- ip_align, tn_buf, reg->off, off, size);
+ "misaligned packet access off %d+%s+%d size %d\n",
+ ip_align, tn_buf, off, size);
return -EACCES;
}
@@ -6547,13 +5204,13 @@ static int check_generic_ptr_alignment(struct bpf_verifier_env *env,
if (!strict || size == 1)
return 0;
- reg_off = tnum_add(reg->var_off, tnum_const(reg->off + off));
+ reg_off = tnum_add(reg->var_off, tnum_const(off));
if (!tnum_is_aligned(reg_off, size)) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "misaligned %saccess off %s+%d+%d size %d\n",
- pointer_desc, tn_buf, reg->off, off, size);
+ verbose(env, "misaligned %saccess off %s+%d size %d\n",
+ pointer_desc, tn_buf, off, size);
return -EACCES;
}
@@ -6656,22 +5313,30 @@ static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
return round_up(max_t(u32, stack_depth, 1), 32);
}
+/* temporary state used for call frame depth calculation */
+struct bpf_subprog_call_depth_info {
+ int ret_insn; /* caller instruction where we return to. */
+ int caller; /* caller subprogram idx */
+ int frame; /* # of consecutive static call stack frames on top of stack */
+};
+
/* starting from main bpf function walk all instructions of the function
* and recursively walk all callees that given function can call.
* Ignore jump and exit insns.
- * Since recursion is prevented by check_cfg() this algorithm
- * only needs a local stack of MAX_CALL_FRAMES to remember callsites
*/
static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
+ struct bpf_subprog_call_depth_info *dinfo,
bool priv_stack_supported)
{
struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
int depth = 0, frame = 0, i, subprog_end, subprog_depth;
bool tail_call_reachable = false;
- int ret_insn[MAX_CALL_FRAMES];
- int ret_prog[MAX_CALL_FRAMES];
- int j;
+ int total;
+ int tmp;
+
+ /* no caller idx */
+ dinfo[idx].caller = -1;
i = subprog[idx].start;
if (!priv_stack_supported)
@@ -6723,8 +5388,12 @@ process_func:
} else {
depth += subprog_depth;
if (depth > MAX_BPF_STACK) {
+ total = 0;
+ for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller)
+ total++;
+
verbose(env, "combined stack size of %d calls is %d. Too large\n",
- frame + 1, depth);
+ total, depth);
return -EACCES;
}
}
@@ -6738,10 +5407,8 @@ continue_func:
if (!is_bpf_throw_kfunc(insn + i))
continue;
- if (subprog[idx].is_cb)
- err = true;
- for (int c = 0; c < frame && !err; c++) {
- if (subprog[ret_prog[c]].is_cb) {
+ for (tmp = idx; tmp >= 0 && !err; tmp = dinfo[tmp].caller) {
+ if (subprog[tmp].is_cb) {
err = true;
break;
}
@@ -6757,12 +5424,10 @@ continue_func:
if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
continue;
/* remember insn and function to return to */
- ret_insn[frame] = i + 1;
- ret_prog[frame] = idx;
/* find the callee */
next_insn = i + insn[i].imm + 1;
- sidx = find_subprog(env, next_insn);
+ sidx = bpf_find_subprog(env, next_insn);
if (verifier_bug_if(sidx < 0, env, "callee not found at insn %d", next_insn))
return -EFAULT;
if (subprog[sidx].is_async_cb) {
@@ -6778,7 +5443,16 @@ continue_func:
return -EINVAL;
}
}
+
+ /* store caller info for after we return from callee */
+ dinfo[idx].frame = frame;
+ dinfo[idx].ret_insn = i + 1;
+
+ /* push caller idx into callee's dinfo */
+ dinfo[sidx].caller = idx;
+
i = next_insn;
+
idx = sidx;
if (!priv_stack_supported)
subprog[idx].priv_stack_mode = NO_PRIV_STACK;
@@ -6786,7 +5460,7 @@ continue_func:
if (subprog[idx].has_tail_call)
tail_call_reachable = true;
- frame++;
+ frame = bpf_subprog_is_global(env, idx) ? 0 : frame + 1;
if (frame >= MAX_CALL_FRAMES) {
verbose(env, "the call stack of %d frames is too deep !\n",
frame);
@@ -6800,12 +5474,12 @@ continue_func:
* tail call counter throughout bpf2bpf calls combined with tailcalls
*/
if (tail_call_reachable)
- for (j = 0; j < frame; j++) {
- if (subprog[ret_prog[j]].is_exception_cb) {
+ for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller) {
+ if (subprog[tmp].is_exception_cb) {
verbose(env, "cannot tail call within exception cb\n");
return -EINVAL;
}
- subprog[ret_prog[j]].tail_call_reachable = true;
+ subprog[tmp].tail_call_reachable = true;
}
if (subprog[0].tail_call_reachable)
env->prog->aux->tail_call_reachable = true;
@@ -6813,23 +5487,33 @@ continue_func:
/* end of for() loop means the last insn of the 'subprog'
* was reached. Doesn't matter whether it was JA or EXIT
*/
- if (frame == 0)
+ if (frame == 0 && dinfo[idx].caller < 0)
return 0;
if (subprog[idx].priv_stack_mode != PRIV_STACK_ADAPTIVE)
depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
- frame--;
- i = ret_insn[frame];
- idx = ret_prog[frame];
+
+ /* pop caller idx from callee */
+ idx = dinfo[idx].caller;
+
+ /* retrieve caller state from its frame */
+ frame = dinfo[idx].frame;
+ i = dinfo[idx].ret_insn;
+
goto continue_func;
}
static int check_max_stack_depth(struct bpf_verifier_env *env)
{
enum priv_stack_mode priv_stack_mode = PRIV_STACK_UNKNOWN;
+ struct bpf_subprog_call_depth_info *dinfo;
struct bpf_subprog_info *si = env->subprog_info;
bool priv_stack_supported;
int ret;
+ dinfo = kvcalloc(env->subprog_cnt, sizeof(*dinfo), GFP_KERNEL_ACCOUNT);
+ if (!dinfo)
+ return -ENOMEM;
+
for (int i = 0; i < env->subprog_cnt; i++) {
if (si[i].has_tail_call) {
priv_stack_mode = NO_PRIV_STACK;
@@ -6851,9 +5535,12 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
for (int i = env->subprog_cnt - 1; i >= 0; i--) {
if (!i || si[i].is_async_cb) {
priv_stack_supported = !i && priv_stack_mode == PRIV_STACK_ADAPTIVE;
- ret = check_max_stack_depth_subprog(env, i, priv_stack_supported);
- if (ret < 0)
+ ret = check_max_stack_depth_subprog(env, i, dinfo,
+ priv_stack_supported);
+ if (ret < 0) {
+ kvfree(dinfo);
return ret;
+ }
}
}
@@ -6864,21 +5551,10 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
}
}
- return 0;
-}
+ kvfree(dinfo);
-#ifndef CONFIG_BPF_JIT_ALWAYS_ON
-static int get_callee_stack_depth(struct bpf_verifier_env *env,
- const struct bpf_insn *insn, int idx)
-{
- int start = idx + insn->imm + 1, subprog;
-
- subprog = find_subprog(env, start);
- if (verifier_bug_if(subprog < 0, env, "get stack depth: no program at insn %d", start))
- return -EFAULT;
- return env->subprog_info[subprog].stack_depth;
+ return 0;
}
-#endif
static int __check_buffer_access(struct bpf_verifier_env *env,
const char *buf_info,
@@ -6891,7 +5567,7 @@ static int __check_buffer_access(struct bpf_verifier_env *env,
regno, buf_info, off, size);
return -EACCES;
}
- if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
+ if (!tnum_is_const(reg->var_off)) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
@@ -6914,8 +5590,8 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env,
if (err)
return err;
- if (off + size > env->prog->aux->max_tp_access)
- env->prog->aux->max_tp_access = off + size;
+ env->prog->aux->max_tp_access = max(reg->var_off.value + off + size,
+ env->prog->aux->max_tp_access);
return 0;
}
@@ -6933,8 +5609,7 @@ static int check_buffer_access(struct bpf_verifier_env *env,
if (err)
return err;
- if (off + size > *max_access)
- *max_access = off + size;
+ *max_access = max(reg->var_off.value + off + size, *max_access);
return 0;
}
@@ -7121,7 +5796,7 @@ out:
set_sext32_default_val(reg, size);
}
-static bool bpf_map_is_rdonly(const struct bpf_map *map)
+bool bpf_map_is_rdonly(const struct bpf_map *map)
{
/* A map is considered read-only if the following condition are true:
*
@@ -7141,8 +5816,8 @@ static bool bpf_map_is_rdonly(const struct bpf_map *map)
!bpf_map_write_active(map);
}
-static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val,
- bool is_ldsx)
+int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val,
+ bool is_ldsx)
{
void *ptr;
u64 addr;
@@ -7327,13 +6002,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
tname);
return -EINVAL;
}
- if (off < 0) {
- verbose(env,
- "R%d is ptr_%s invalid negative access: off=%d\n",
- regno, tname, off);
- return -EACCES;
- }
- if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
+
+ if (!tnum_is_const(reg->var_off)) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
@@ -7343,6 +6013,15 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
return -EACCES;
}
+ off += reg->var_off.value;
+
+ if (off < 0) {
+ verbose(env,
+ "R%d is ptr_%s invalid negative access: off=%d\n",
+ regno, tname, off);
+ return -EACCES;
+ }
+
if (reg->type & MEM_USER) {
verbose(env,
"R%d is ptr_%s access user memory: off=%d\n",
@@ -7554,7 +6233,7 @@ static int check_stack_access_within_bounds(
enum bpf_access_type type)
{
struct bpf_reg_state *reg = reg_state(env, regno);
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
s64 min_off, max_off;
int err;
char *err_extra;
@@ -7589,8 +6268,8 @@ static int check_stack_access_within_bounds(
if (err) {
if (tnum_is_const(reg->var_off)) {
- verbose(env, "invalid%s stack R%d off=%d size=%d\n",
- err_extra, regno, off, access_size);
+ verbose(env, "invalid%s stack R%d off=%lld size=%d\n",
+ err_extra, regno, min_off, access_size);
} else {
char tn_buf[48];
@@ -7618,6 +6297,23 @@ static bool get_func_retval_range(struct bpf_prog *prog,
return false;
}
+static void add_scalar_to_reg(struct bpf_reg_state *dst_reg, s64 val)
+{
+ struct bpf_reg_state fake_reg;
+
+ if (!val)
+ return;
+
+ fake_reg.type = SCALAR_VALUE;
+ __mark_reg_known(&fake_reg, val);
+
+ scalar32_min_max_add(dst_reg, &fake_reg);
+ scalar_min_max_add(dst_reg, &fake_reg);
+ dst_reg->var_off = tnum_add(dst_reg->var_off, fake_reg.var_off);
+
+ reg_bounds_sync(dst_reg);
+}
+
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
@@ -7636,14 +6332,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (size < 0)
return size;
- /* alignment checks will add in reg->off themselves */
err = check_ptr_alignment(env, reg, off, size, strict_alignment_once);
if (err)
return err;
- /* for access checks, reg->off is just part of off */
- off += reg->off;
-
if (reg->type == PTR_TO_MAP_KEY) {
if (t == BPF_WRITE) {
verbose(env, "write to change key R%d not allowed\n", regno);
@@ -7703,6 +6395,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
return -EACCES;
}
copy_register_state(&regs[value_regno], reg);
+ add_scalar_to_reg(&regs[value_regno], off);
regs[value_regno].type = PTR_TO_INSN;
} else {
mark_reg_unknown(env, regs, value_regno);
@@ -7740,12 +6433,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem))
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
- struct bpf_retval_range range;
struct bpf_insn_access_aux info = {
.reg_type = SCALAR_VALUE,
.is_ldsx = is_ldsx,
.log = &env->log,
};
+ struct bpf_retval_range range;
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
@@ -7753,13 +6446,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
return -EACCES;
}
- err = check_ptr_off_reg(env, reg, regno);
- if (err < 0)
- return err;
-
- err = check_ctx_access(env, insn_idx, off, size, t, &info);
- if (err)
- verbose_linfo(env, insn_idx, "; ");
+ err = check_ctx_access(env, insn_idx, regno, off, size, t, &info);
if (!err && t == BPF_READ && value_regno >= 0) {
/* ctx access returns either a scalar, or a
* PTR_TO_PACKET[_META,_END]. In the latter
@@ -7851,7 +6538,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else if (reg->type == CONST_PTR_TO_MAP) {
err = check_ptr_to_map_access(env, regs, regno, off, size, t,
value_regno);
- } else if (base_type(reg->type) == PTR_TO_BUF) {
+ } else if (base_type(reg->type) == PTR_TO_BUF &&
+ !type_may_be_null(reg->type)) {
bool rdonly_mem = type_is_rdonly_mem(reg->type);
u32 *max_access;
@@ -8122,8 +6810,6 @@ static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn)
* on the access type and privileges, that all elements of the stack are
* initialized.
*
- * 'off' includes 'regno->off', but not its dynamic part (if any).
- *
* All registers that have been spilled on the stack in the slots within the
* read offsets are marked as read.
*/
@@ -8133,21 +6819,27 @@ static int check_stack_range_initialized(
enum bpf_access_type type, struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *reg = reg_state(env, regno);
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int err, min_off, max_off, i, j, slot, spi;
/* Some accesses can write anything into the stack, others are
* read-only.
*/
- bool clobber = false;
+ bool clobber = type == BPF_WRITE;
+ /*
+ * Negative access_size signals global subprog/kfunc arg check where
+ * STACK_POISON slots are acceptable. static stack liveness
+ * might have determined that subprog doesn't read them,
+ * but BTF based global subprog validation isn't accurate enough.
+ */
+ bool allow_poison = access_size < 0 || clobber;
+
+ access_size = abs(access_size);
if (access_size == 0 && !zero_size_allowed) {
verbose(env, "invalid zero-sized read\n");
return -EACCES;
}
- if (type == BPF_WRITE)
- clobber = true;
-
err = check_stack_access_within_bounds(env, regno, off, access_size, type);
if (err)
return err;
@@ -8199,7 +6891,7 @@ static int check_stack_range_initialized(
for (i = min_off; i < max_off + access_size; i++) {
int stack_off = -i - 1;
- spi = __get_spi(i);
+ spi = bpf_get_spi(i);
/* raw_mode may write past allocated_stack */
if (state->allocated_stack <= stack_off)
continue;
@@ -8235,7 +6927,7 @@ static int check_stack_range_initialized(
goto mark;
}
- if (is_spilled_reg(&state->stack[spi]) &&
+ if (bpf_is_spilled_reg(&state->stack[spi]) &&
(state->stack[spi].spilled_ptr.type == SCALAR_VALUE ||
env->allow_ptr_leaks)) {
if (clobber) {
@@ -8246,7 +6938,12 @@ static int check_stack_range_initialized(
goto mark;
}
- if (tnum_is_const(reg->var_off)) {
+ if (*stype == STACK_POISON) {
+ if (allow_poison)
+ goto mark;
+ verbose(env, "reading from stack R%d off %d+%d size %d, slot poisoned by dead code elimination\n",
+ regno, min_off, i - min_off, access_size);
+ } else if (tnum_is_const(reg->var_off)) {
verbose(env, "invalid read from stack R%d off %d+%d size %d\n",
regno, min_off, i - min_off, access_size);
} else {
@@ -8258,17 +6955,7 @@ static int check_stack_range_initialized(
}
return -EACCES;
mark:
- /* reading any byte out of 8-byte 'spill_slot' will cause
- * the whole slot to be marked as 'read'
- */
- err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi));
- if (err)
- return err;
- /* We do not call bpf_mark_stack_write(), as we can not
- * be sure that whether stack slot is written to or not. Hence,
- * we must still conservatively propagate reads upwards even if
- * helper may write to the entire memory range.
- */
+ ;
}
return 0;
}
@@ -8284,7 +6971,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
switch (base_type(reg->type)) {
case PTR_TO_PACKET:
case PTR_TO_PACKET_META:
- return check_packet_access(env, regno, reg->off, access_size,
+ return check_packet_access(env, regno, 0, access_size,
zero_size_allowed);
case PTR_TO_MAP_KEY:
if (access_type == BPF_WRITE) {
@@ -8292,12 +6979,12 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
reg_type_str(env, reg->type));
return -EACCES;
}
- return check_mem_region_access(env, regno, reg->off, access_size,
+ return check_mem_region_access(env, regno, 0, access_size,
reg->map_ptr->key_size, false);
case PTR_TO_MAP_VALUE:
- if (check_map_access_type(env, regno, reg->off, access_size, access_type))
+ if (check_map_access_type(env, regno, 0, access_size, access_type))
return -EACCES;
- return check_map_access(env, regno, reg->off, access_size,
+ return check_map_access(env, regno, 0, access_size,
zero_size_allowed, ACCESS_HELPER);
case PTR_TO_MEM:
if (type_is_rdonly_mem(reg->type)) {
@@ -8307,7 +6994,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return -EACCES;
}
}
- return check_mem_region_access(env, regno, reg->off,
+ return check_mem_region_access(env, regno, 0,
access_size, reg->mem_size,
zero_size_allowed);
case PTR_TO_BUF:
@@ -8322,39 +7009,33 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
} else {
max_access = &env->prog->aux->max_rdwr_access;
}
- return check_buffer_access(env, reg, regno, reg->off,
+ return check_buffer_access(env, reg, regno, 0,
access_size, zero_size_allowed,
max_access);
case PTR_TO_STACK:
return check_stack_range_initialized(
env,
- regno, reg->off, access_size,
+ regno, 0, access_size,
zero_size_allowed, access_type, meta);
case PTR_TO_BTF_ID:
- return check_ptr_to_btf_access(env, regs, regno, reg->off,
+ return check_ptr_to_btf_access(env, regs, regno, 0,
access_size, BPF_READ, -1);
case PTR_TO_CTX:
- /* in case the function doesn't know how to access the context,
- * (because we are in a program of type SYSCALL for example), we
- * can not statically check its size.
- * Dynamically check it now.
- */
- if (!env->ops->convert_ctx_access) {
- int offset = access_size - 1;
-
- /* Allow zero-byte read from PTR_TO_CTX */
- if (access_size == 0)
- return zero_size_allowed ? 0 : -EACCES;
-
- return check_mem_access(env, env->insn_idx, regno, offset, BPF_B,
- access_type, -1, false, false);
+ /* Only permit reading or writing syscall context using helper calls. */
+ if (is_var_ctx_off_allowed(env->prog)) {
+ int err = check_mem_region_access(env, regno, 0, access_size, U16_MAX,
+ zero_size_allowed);
+ if (err)
+ return err;
+ if (env->prog->aux->max_ctx_offset < reg->umax_value + access_size)
+ env->prog->aux->max_ctx_offset = reg->umax_value + access_size;
+ return 0;
}
-
fallthrough;
default: /* scalar_value or invalid ptr */
/* Allow zero-byte read from NULL, regardless of pointer type */
if (zero_size_allowed && access_size == 0 &&
- register_is_null(reg))
+ bpf_register_is_null(reg))
return 0;
verbose(env, "R%d type=%s ", regno,
@@ -8427,7 +7108,7 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg
struct bpf_reg_state saved_reg;
int err;
- if (register_is_null(reg))
+ if (bpf_register_is_null(reg))
return 0;
/* Assuming that the register contains a value check if the memory
@@ -8439,8 +7120,10 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg
mark_ptr_not_null_reg(reg);
}
- err = check_helper_mem_access(env, regno, mem_size, BPF_READ, true, NULL);
- err = err ?: check_helper_mem_access(env, regno, mem_size, BPF_WRITE, true, NULL);
+ int size = base_type(reg->type) == PTR_TO_STACK ? -(int)mem_size : mem_size;
+
+ err = check_helper_mem_access(env, regno, size, BPF_READ, true, NULL);
+ err = err ?: check_helper_mem_access(env, regno, size, BPF_WRITE, true, NULL);
if (may_be_null)
*reg = saved_reg;
@@ -8543,9 +7226,9 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags)
return -EINVAL;
}
spin_lock_off = is_res_lock ? rec->res_spin_lock_off : rec->spin_lock_off;
- if (spin_lock_off != val + reg->off) {
+ if (spin_lock_off != val) {
verbose(env, "off %lld doesn't point to 'struct %s_lock' that is at %d\n",
- val + reg->off, lock_str, spin_lock_off);
+ val, lock_str, spin_lock_off);
return -EINVAL;
}
if (is_lock) {
@@ -8660,9 +7343,9 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno,
verifier_bug(env, "unsupported BTF field type: %s\n", struct_name);
return -EINVAL;
}
- if (field_off != val + reg->off) {
+ if (field_off != val) {
verbose(env, "off %lld doesn't point to 'struct %s' that is at %d\n",
- val + reg->off, struct_name, field_off);
+ val, struct_name, field_off);
return -EINVAL;
}
if (map_desc->ptr) {
@@ -8730,7 +7413,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
return -EINVAL;
}
- kptr_off = reg->off + reg->var_off.value;
+ kptr_off = reg->var_off.value;
kptr_field = btf_record_find(rec, kptr_off, BPF_KPTR);
if (!kptr_field) {
verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
@@ -8851,7 +7534,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
return state->stack[spi].spilled_ptr.ref_obj_id;
}
@@ -8866,10 +7549,6 @@ static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta)
return meta->kfunc_flags & KF_ITER_NEW;
}
-static bool is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta)
-{
- return meta->kfunc_flags & KF_ITER_NEXT;
-}
static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -8987,7 +7666,7 @@ static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env,
struct list_head *pos, *head;
/* Explored states are pushed in stack order, most recent states come first */
- head = explored_state(env, insn_idx);
+ head = bpf_explored_state(env, insn_idx);
list_for_each(pos, head) {
sl = container_of(pos, struct bpf_verifier_state_list, node);
/* If st->branches != 0 state is a part of current DFS verification path,
@@ -9002,11 +7681,6 @@ static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env,
return NULL;
}
-static void reset_idmap_scratch(struct bpf_verifier_env *env);
-static bool regs_exact(const struct bpf_reg_state *rold,
- const struct bpf_reg_state *rcur,
- struct bpf_idmap *idmap);
-
/*
* Check if scalar registers are exact for the purpose of not widening.
* More lenient than regs_exact()
@@ -9048,8 +7722,8 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env,
num_slots = min(fold->allocated_stack / BPF_REG_SIZE,
fcur->allocated_stack / BPF_REG_SIZE);
for (i = 0; i < num_slots; i++) {
- if (!is_spilled_reg(&fold->stack[i]) ||
- !is_spilled_reg(&fcur->stack[i]))
+ if (!bpf_is_spilled_reg(&fold->stack[i]) ||
+ !bpf_is_spilled_reg(&fcur->stack[i]))
continue;
maybe_widen_reg(env,
@@ -9293,6 +7967,7 @@ static const struct bpf_reg_types mem_types = {
PTR_TO_MEM | MEM_RINGBUF,
PTR_TO_BUF,
PTR_TO_BTF_ID | PTR_TRUSTED,
+ PTR_TO_CTX,
},
};
@@ -9329,7 +8004,9 @@ static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE }
static const struct bpf_reg_types kptr_xchg_dest_types = {
.types = {
PTR_TO_MAP_VALUE,
- PTR_TO_BTF_ID | MEM_ALLOC
+ PTR_TO_BTF_ID | MEM_ALLOC,
+ PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF,
+ PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU,
}
};
static const struct bpf_reg_types dynptr_types = {
@@ -9373,7 +8050,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
struct bpf_reg_state *reg = reg_state(env, regno);
enum bpf_reg_type expected, type = reg->type;
const struct bpf_reg_types *compatible;
- int i, j;
+ int i, j, err;
compatible = compatible_reg_types[base_type(arg_type)];
if (!compatible) {
@@ -9476,8 +8153,12 @@ found:
return -EACCES;
}
- if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
- btf_vmlinux, *arg_btf_id,
+ err = __check_ptr_off_reg(env, reg, regno, true);
+ if (err)
+ return err;
+
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id,
+ reg->var_off.value, btf_vmlinux, *arg_btf_id,
strict_type_match)) {
verbose(env, "R%d is of type %s but %s is expected\n",
regno, btf_type_name(reg->btf, reg->btf_id),
@@ -9489,6 +8170,8 @@ found:
}
case PTR_TO_BTF_ID | MEM_ALLOC:
case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC:
+ case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
+ case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
meta->func_id != BPF_FUNC_kptr_xchg) {
verifier_bug(env, "unimplemented handling of MEM_ALLOC");
@@ -9555,12 +8238,11 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env,
* because fixed_off_ok is false, but checking here allows us
* to give the user a better error message.
*/
- if (reg->off) {
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value != 0) {
verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n",
regno);
return -EINVAL;
}
- return __check_ptr_off_reg(env, reg, regno, false);
}
switch (type) {
@@ -9595,6 +8277,16 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env,
* still need to do checks instead of returning.
*/
return __check_ptr_off_reg(env, reg, regno, true);
+ case PTR_TO_CTX:
+ /*
+ * Allow fixed and variable offsets for syscall context, but
+ * only when the argument is passed as memory, not ctx,
+ * otherwise we may get modified ctx in tail called programs and
+ * global subprogs (that may act as extension prog hooks).
+ */
+ if (arg_type != ARG_PTR_TO_CTX && is_var_ctx_off_allowed(env->prog))
+ return 0;
+ fallthrough;
default:
return __check_ptr_off_reg(env, reg, regno, false);
}
@@ -9624,7 +8316,7 @@ static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env,
static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int spi;
if (reg->type == CONST_PTR_TO_DYNPTR)
@@ -9637,7 +8329,7 @@ static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int spi;
if (reg->type == CONST_PTR_TO_DYNPTR)
@@ -9651,13 +8343,13 @@ static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state
static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env,
struct bpf_reg_state *reg)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int spi;
if (reg->type == CONST_PTR_TO_DYNPTR)
return reg->dynptr.type;
- spi = __get_spi(reg->off);
+ spi = bpf_get_spi(reg->var_off.value);
if (spi < 0) {
verbose(env, "verifier internal error: invalid spi when querying dynptr type\n");
return BPF_DYNPTR_TYPE_INVALID;
@@ -9698,13 +8390,13 @@ static int check_reg_const_str(struct bpf_verifier_env *env,
return -EACCES;
}
- err = check_map_access(env, regno, reg->off,
- map->value_size - reg->off, false,
+ err = check_map_access(env, regno, 0,
+ map->value_size - reg->var_off.value, false,
ACCESS_HELPER);
if (err)
return err;
- map_off = reg->off + reg->var_off.value;
+ map_off = reg->var_off.value;
err = map->ops->map_direct_value_addr(map, &map_addr, map_off);
if (err) {
verbose(env, "direct value access on string failed\n");
@@ -9725,7 +8417,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env,
u32 key_size,
s64 *value)
{
- struct bpf_func_state *state = func(env, key);
+ struct bpf_func_state *state = bpf_func(env, key);
struct bpf_reg_state *reg;
int slot, spi, off;
int spill_size = 0;
@@ -9741,7 +8433,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env,
if (!tnum_is_const(key->var_off))
return -EOPNOTSUPP;
- stack_off = key->off + key->var_off.value;
+ stack_off = key->var_off.value;
slot = -stack_off - 1;
spi = slot / BPF_REG_SIZE;
off = slot % BPF_REG_SIZE;
@@ -9756,7 +8448,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env,
}
/* Check that stack contains a scalar spill of expected size */
- if (!is_spilled_scalar_reg(&state->stack[spi]))
+ if (!bpf_is_spilled_scalar_reg(&state->stack[spi]))
return -EOPNOTSUPP;
for (i = off; i >= 0 && stype[i] == STACK_SPILL; i--)
spill_size++;
@@ -9771,7 +8463,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env,
/* We are relying on a constant value. So mark as precise
* to prevent pruning on it.
*/
- bt_set_frame_slot(&env->bt, key->frameno, spi);
+ bpf_bt_set_frame_slot(&env->bt, key->frameno, spi);
err = mark_chain_precision_batch(env, env->cur_state);
if (err < 0)
return err;
@@ -9823,7 +8515,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
return err;
}
- if (register_is_null(reg) && type_may_be_null(arg_type))
+ if (bpf_register_is_null(reg) && type_may_be_null(arg_type))
/* A NULL register has a SCALAR_VALUE type, so skip
* type checking.
*/
@@ -9845,7 +8537,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
skip_type_check:
if (arg_type_is_release(arg_type)) {
if (arg_type_is_dynptr(arg_type)) {
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int spi;
/* Only dynptr created on stack can be released, thus
@@ -9863,7 +8555,7 @@ skip_type_check:
verbose(env, "cannot release unowned const bpf_dynptr\n");
return -EINVAL;
}
- } else if (!reg->ref_obj_id && !register_is_null(reg)) {
+ } else if (!reg->ref_obj_id && !bpf_register_is_null(reg)) {
verbose(env, "R%d must be referenced when passed to release function\n",
regno);
return -EINVAL;
@@ -9942,7 +8634,7 @@ skip_type_check:
}
break;
case ARG_PTR_TO_MAP_VALUE:
- if (type_may_be_null(arg_type) && register_is_null(reg))
+ if (type_may_be_null(arg_type) && bpf_register_is_null(reg))
return 0;
/* bpf_map_xxx(..., map_ptr, ..., value) call:
@@ -10090,7 +8782,7 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)
return false;
}
-static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env)
+bool bpf_allow_tail_call_in_subprogs(struct bpf_verifier_env *env)
{
return env->prog->jit_requested &&
bpf_jit_supports_subprog_tailcalls();
@@ -10235,7 +8927,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_FUNC_tail_call:
if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
goto error;
- if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) {
+ if (env->subprog_cnt > 1 && !bpf_allow_tail_call_in_subprogs(env)) {
verbose(env, "mixing of tail_calls and bpf-to-bpf calls is not supported\n");
return -EINVAL;
}
@@ -10547,7 +9239,7 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env,
/* after the call registers r0 - r5 were scratched */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
- mark_reg_not_init(env, regs, caller_saved[i]);
+ bpf_mark_reg_not_init(env, &regs[caller_saved[i]]);
__check_reg_arg(env, regs, caller_saved[i], DST_OP_NO_MARK);
}
}
@@ -10642,7 +9334,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
* invalid memory access.
*/
} else if (arg->arg_type == ARG_PTR_TO_CTX) {
- ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE);
+ ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_CTX);
if (ret < 0)
return ret;
/* If function expects ctx type in BTF check that caller
@@ -10686,7 +9378,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
struct bpf_call_arg_meta meta;
int err;
- if (register_is_null(reg) && type_may_be_null(arg->arg_type))
+ if (bpf_register_is_null(reg) && type_may_be_null(arg->arg_type))
continue;
memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */
@@ -10768,7 +9460,7 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
return -EFAULT;
}
- if (is_async_callback_calling_insn(insn)) {
+ if (bpf_is_async_callback_calling_insn(insn)) {
struct bpf_verifier_state *async_cb;
/* there is no real recursion here. timer and workqueue callbacks are async */
@@ -10815,7 +9507,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int err, subprog, target_insn;
target_insn = *insn_idx + insn->imm + 1;
- subprog = find_subprog(env, target_insn);
+ subprog = bpf_find_subprog(env, target_insn);
if (verifier_bug_if(subprog < 0, env, "target of func call at insn %d is not a program",
target_insn))
return -EFAULT;
@@ -10824,7 +9516,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
err = btf_check_subprog_call(env, subprog, caller->regs);
if (err == -EFAULT)
return err;
- if (subprog_is_global(env, subprog)) {
+ if (bpf_subprog_is_global(env, subprog)) {
const char *sub_name = subprog_name(env, subprog);
if (env->cur_state->active_locks) {
@@ -10833,12 +9525,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EINVAL;
}
- if (env->subprog_info[subprog].might_sleep &&
- (env->cur_state->active_rcu_locks || env->cur_state->active_preempt_locks ||
- env->cur_state->active_irq_id || !in_sleepable(env))) {
- verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n"
- "i.e., in a RCU/IRQ/preempt-disabled section, or in\n"
- "a non-sleepable BPF program context\n");
+ if (env->subprog_info[subprog].might_sleep && !in_sleepable_context(env)) {
+ verbose(env, "sleepable global function %s() called in %s\n",
+ sub_name, non_sleepable_context_description(env));
return -EINVAL;
}
@@ -10857,9 +9546,11 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
subprog_aux(env, subprog)->called = true;
clear_caller_saved_regs(env, caller->regs);
- /* All global functions return a 64-bit SCALAR_VALUE */
- mark_reg_unknown(env, caller->regs, BPF_REG_0);
- caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+ /* All non-void global functions return a 64-bit SCALAR_VALUE. */
+ if (!subprog_returns_void(env, subprog)) {
+ mark_reg_unknown(env, caller->regs, BPF_REG_0);
+ caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+ }
/* continue with next insn after call */
return 0;
@@ -10877,8 +9568,6 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
/* and go analyze first insn of the callee */
*insn_idx = env->subprog_info[subprog].start - 1;
- bpf_reset_live_stack_callchain(env);
-
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "caller:\n");
print_verifier_state(env, state, caller->frameno, true);
@@ -10912,7 +9601,7 @@ int map_set_for_each_callback_args(struct bpf_verifier_env *env,
callee->regs[BPF_REG_4] = caller->regs[BPF_REG_3];
/* unused */
- __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
return 0;
}
@@ -10969,9 +9658,9 @@ static int set_loop_callback_state(struct bpf_verifier_env *env,
callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
/* unused */
- __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
- __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
- __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
callee->callback_ret_range = retval_range(0, 1);
@@ -11001,8 +9690,8 @@ static int set_timer_callback_state(struct bpf_verifier_env *env,
callee->regs[BPF_REG_3].map_ptr = map_ptr;
/* unused */
- __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
- __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_async_callback_fn = true;
callee->callback_ret_range = retval_range(0, 0);
return 0;
@@ -11029,8 +9718,8 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env,
callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4];
/* unused */
- __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
- __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
callee->callback_ret_range = retval_range(0, 1);
return 0;
@@ -11045,14 +9734,14 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
* callback_ctx, u64 flags);
* callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx);
*/
- __mark_reg_not_init(env, &callee->regs[BPF_REG_0]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_0]);
mark_dynptr_cb_reg(env, &callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL);
callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
/* unused */
- __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
- __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
- __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
callee->callback_ret_range = retval_range(0, 1);
@@ -11073,7 +9762,8 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
*/
struct btf_field *field;
- field = reg_find_field_offset(&caller->regs[BPF_REG_1], caller->regs[BPF_REG_1].off,
+ field = reg_find_field_offset(&caller->regs[BPF_REG_1],
+ caller->regs[BPF_REG_1].var_off.value,
BPF_RB_ROOT);
if (!field || !field->graph_root.value_btf_id)
return -EFAULT;
@@ -11083,9 +9773,9 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
mark_reg_graph_node(callee->regs, BPF_REG_2, &field->graph_root);
ref_set_non_owning(env, &callee->regs[BPF_REG_2]);
- __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
- __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
- __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
callee->callback_ret_range = retval_range(0, 1);
return 0;
@@ -11114,8 +9804,8 @@ static int set_task_work_schedule_callback_state(struct bpf_verifier_env *env,
callee->regs[BPF_REG_3].map_ptr = map_ptr;
/* unused */
- __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
- __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_async_callback_fn = true;
callee->callback_ret_range = retval_range(S32_MIN, S32_MAX);
return 0;
@@ -11146,10 +9836,9 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
return is_rbtree_lock_required_kfunc(kfunc_btf_id);
}
-static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg,
- bool return_32bit)
+static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg)
{
- if (return_32bit)
+ if (range.return_32bit)
return range.minval <= reg->s32_min_value && reg->s32_max_value <= range.maxval;
else
return range.minval <= reg->smin_value && reg->smax_value <= range.maxval;
@@ -11163,10 +9852,6 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
bool in_callback_fn;
int err;
- err = bpf_update_live_stack(env);
- if (err)
- return err;
-
callee = state->frame[state->curframe];
r0 = &callee->regs[BPF_REG_0];
if (r0->type == PTR_TO_STACK) {
@@ -11193,7 +9878,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
return err;
/* enforce R0 return value range, and bpf_callback_t returns 64bit */
- if (!retval_range_within(callee->callback_ret_range, r0, false)) {
+ if (!retval_range_within(callee->callback_ret_range, r0)) {
verbose_invalid_scalar(env, r0, callee->callback_ret_range,
"At callback return", "R0");
return -EINVAL;
@@ -11449,7 +10134,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
/* fmt being ARG_PTR_TO_CONST_STR guarantees that var_off is const
* and map_direct_value_addr is set.
*/
- fmt_map_off = fmt_reg->off + fmt_reg->var_off.value;
+ fmt_map_off = fmt_reg->var_off.value;
err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr,
fmt_map_off);
if (err) {
@@ -11475,7 +10160,7 @@ static int check_get_func_ip(struct bpf_verifier_env *env)
if (type == BPF_PROG_TYPE_TRACING) {
if (!bpf_prog_has_trampoline(env->prog)) {
- verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n",
+ verbose(env, "func %s#%d supported only for fentry/fexit/fsession/fmod_ret programs\n",
func_id_name(func_id), func_id);
return -ENOTSUPP;
}
@@ -11497,7 +10182,7 @@ static struct bpf_insn_aux_data *cur_aux(const struct bpf_verifier_env *env)
static bool loop_flag_is_zero(struct bpf_verifier_env *env)
{
struct bpf_reg_state *reg = reg_state(env, BPF_REG_4);
- bool reg_is_null = register_is_null(reg);
+ bool reg_is_null = bpf_register_is_null(reg);
if (reg_is_null)
mark_chain_precision(env, BPF_REG_4);
@@ -11538,8 +10223,8 @@ static bool can_elide_value_nullness(enum bpf_map_type type)
}
}
-static int get_helper_proto(struct bpf_verifier_env *env, int func_id,
- const struct bpf_func_proto **ptr)
+int bpf_get_helper_proto(struct bpf_verifier_env *env, int func_id,
+ const struct bpf_func_proto **ptr)
{
if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID)
return -ERANGE;
@@ -11561,6 +10246,19 @@ static inline bool in_sleepable_context(struct bpf_verifier_env *env)
in_sleepable(env);
}
+static const char *non_sleepable_context_description(struct bpf_verifier_env *env)
+{
+ if (env->cur_state->active_rcu_locks)
+ return "rcu_read_lock region";
+ if (env->cur_state->active_preempt_locks)
+ return "non-preemptible region";
+ if (env->cur_state->active_irq_id)
+ return "IRQ-disabled region";
+ if (env->cur_state->active_locks)
+ return "lock region";
+ return "non-sleepable prog";
+}
+
static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
@@ -11577,7 +10275,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* find function prototype */
func_id = insn->imm;
- err = get_helper_proto(env, insn->imm, &fn);
+ err = bpf_get_helper_proto(env, insn->imm, &fn);
if (err == -ERANGE) {
verbose(env, "invalid func %s#%d\n", func_id_name(func_id), func_id);
return -EINVAL;
@@ -11600,11 +10298,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EINVAL;
}
- if (!in_sleepable(env) && fn->might_sleep) {
- verbose(env, "helper call might sleep in a non-sleepable prog\n");
- return -EINVAL;
- }
-
/* With LD_ABS/IND some JITs save/restore skb from r1. */
changes_data = bpf_helper_changes_pkt_data(func_id);
if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
@@ -11621,28 +10314,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return err;
}
- if (env->cur_state->active_rcu_locks) {
- if (fn->might_sleep) {
- verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n",
- func_id_name(func_id), func_id);
- return -EINVAL;
- }
- }
-
- if (env->cur_state->active_preempt_locks) {
- if (fn->might_sleep) {
- verbose(env, "sleepable helper %s#%d in non-preemptible region\n",
- func_id_name(func_id), func_id);
- return -EINVAL;
- }
- }
-
- if (env->cur_state->active_irq_id) {
- if (fn->might_sleep) {
- verbose(env, "sleepable helper %s#%d in IRQ-disabled region\n",
- func_id_name(func_id), func_id);
- return -EINVAL;
- }
+ if (fn->might_sleep && !in_sleepable_context(env)) {
+ verbose(env, "sleepable helper %s#%d in %s\n", func_id_name(func_id), func_id,
+ non_sleepable_context_description(env));
+ return -EINVAL;
}
/* Track non-sleepable context for helpers. */
@@ -11703,7 +10378,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
} else if (meta.ref_obj_id) {
err = release_reference(env, meta.ref_obj_id);
- } else if (register_is_null(&regs[meta.release_regno])) {
+ } else if (bpf_register_is_null(&regs[meta.release_regno])) {
/* meta.ref_obj_id can only be 0 if register that is meant to be
* released is NULL, which must be > R0.
*/
@@ -11726,7 +10401,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* check that flags argument in get_local_storage(map, flags) is 0,
* this is required because get_local_storage() can't return an error.
*/
- if (!register_is_null(&regs[BPF_REG_2])) {
+ if (!bpf_register_is_null(&regs[BPF_REG_2])) {
verbose(env, "get_local_storage() doesn't support non-zero flags\n");
return -EINVAL;
}
@@ -11869,7 +10544,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* reset caller saved regs */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
- mark_reg_not_init(env, regs, caller_saved[i]);
+ bpf_mark_reg_not_init(env, &regs[caller_saved[i]]);
check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
}
@@ -12131,10 +10806,6 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)
return meta->kfunc_flags & KF_RELEASE;
}
-static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta)
-{
- return meta->kfunc_flags & KF_SLEEPABLE;
-}
static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -12355,6 +11026,28 @@ static bool is_kfunc_arg_prog_aux(const struct btf *btf, const struct btf_param
return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_PROG_AUX_ID);
}
+/*
+ * A kfunc with KF_IMPLICIT_ARGS has two prototypes in BTF:
+ * - the _impl prototype with full arg list (meta->func_proto)
+ * - the BPF API prototype w/o implicit args (func->type in BTF)
+ * To determine whether an argument is implicit, we compare its position
+ * against the number of arguments in the prototype w/o implicit args.
+ */
+static bool is_kfunc_arg_implicit(const struct bpf_kfunc_call_arg_meta *meta, u32 arg_idx)
+{
+ const struct btf_type *func, *func_proto;
+ u32 argn;
+
+ if (!(meta->kfunc_flags & KF_IMPLICIT_ARGS))
+ return false;
+
+ func = btf_type_by_id(meta->btf, meta->func_id);
+ func_proto = btf_type_by_id(meta->btf, func->type);
+ argn = btf_type_vlen(func_proto);
+
+ return argn <= arg_idx;
+}
+
/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env,
const struct btf *btf,
@@ -12421,10 +11114,15 @@ enum kfunc_ptr_arg_type {
enum special_kfunc_type {
KF_bpf_obj_new_impl,
+ KF_bpf_obj_new,
KF_bpf_obj_drop_impl,
+ KF_bpf_obj_drop,
KF_bpf_refcount_acquire_impl,
+ KF_bpf_refcount_acquire,
KF_bpf_list_push_front_impl,
+ KF_bpf_list_push_front,
KF_bpf_list_push_back_impl,
+ KF_bpf_list_push_back,
KF_bpf_list_pop_front,
KF_bpf_list_pop_back,
KF_bpf_list_front,
@@ -12435,6 +11133,7 @@ enum special_kfunc_type {
KF_bpf_rcu_read_unlock,
KF_bpf_rbtree_remove,
KF_bpf_rbtree_add_impl,
+ KF_bpf_rbtree_add,
KF_bpf_rbtree_first,
KF_bpf_rbtree_root,
KF_bpf_rbtree_left,
@@ -12447,7 +11146,9 @@ enum special_kfunc_type {
KF_bpf_dynptr_slice_rdwr,
KF_bpf_dynptr_clone,
KF_bpf_percpu_obj_new_impl,
+ KF_bpf_percpu_obj_new,
KF_bpf_percpu_obj_drop_impl,
+ KF_bpf_percpu_obj_drop,
KF_bpf_throw,
KF_bpf_wq_set_callback,
KF_bpf_preempt_disable,
@@ -12481,10 +11182,15 @@ enum special_kfunc_type {
BTF_ID_LIST(special_kfunc_list)
BTF_ID(func, bpf_obj_new_impl)
+BTF_ID(func, bpf_obj_new)
BTF_ID(func, bpf_obj_drop_impl)
+BTF_ID(func, bpf_obj_drop)
BTF_ID(func, bpf_refcount_acquire_impl)
+BTF_ID(func, bpf_refcount_acquire)
BTF_ID(func, bpf_list_push_front_impl)
+BTF_ID(func, bpf_list_push_front)
BTF_ID(func, bpf_list_push_back_impl)
+BTF_ID(func, bpf_list_push_back)
BTF_ID(func, bpf_list_pop_front)
BTF_ID(func, bpf_list_pop_back)
BTF_ID(func, bpf_list_front)
@@ -12495,6 +11201,7 @@ BTF_ID(func, bpf_rcu_read_lock)
BTF_ID(func, bpf_rcu_read_unlock)
BTF_ID(func, bpf_rbtree_remove)
BTF_ID(func, bpf_rbtree_add_impl)
+BTF_ID(func, bpf_rbtree_add)
BTF_ID(func, bpf_rbtree_first)
BTF_ID(func, bpf_rbtree_root)
BTF_ID(func, bpf_rbtree_left)
@@ -12514,7 +11221,9 @@ BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
BTF_ID(func, bpf_percpu_obj_new_impl)
+BTF_ID(func, bpf_percpu_obj_new)
BTF_ID(func, bpf_percpu_obj_drop_impl)
+BTF_ID(func, bpf_percpu_obj_drop)
BTF_ID(func, bpf_throw)
BTF_ID(func, bpf_wq_set_callback)
BTF_ID(func, bpf_preempt_disable)
@@ -12558,6 +11267,50 @@ BTF_ID(func, bpf_session_is_return)
BTF_ID(func, bpf_stream_vprintk)
BTF_ID(func, bpf_stream_print_stack)
+static bool is_bpf_obj_new_kfunc(u32 func_id)
+{
+ return func_id == special_kfunc_list[KF_bpf_obj_new] ||
+ func_id == special_kfunc_list[KF_bpf_obj_new_impl];
+}
+
+static bool is_bpf_percpu_obj_new_kfunc(u32 func_id)
+{
+ return func_id == special_kfunc_list[KF_bpf_percpu_obj_new] ||
+ func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl];
+}
+
+static bool is_bpf_obj_drop_kfunc(u32 func_id)
+{
+ return func_id == special_kfunc_list[KF_bpf_obj_drop] ||
+ func_id == special_kfunc_list[KF_bpf_obj_drop_impl];
+}
+
+static bool is_bpf_percpu_obj_drop_kfunc(u32 func_id)
+{
+ return func_id == special_kfunc_list[KF_bpf_percpu_obj_drop] ||
+ func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl];
+}
+
+static bool is_bpf_refcount_acquire_kfunc(u32 func_id)
+{
+ return func_id == special_kfunc_list[KF_bpf_refcount_acquire] ||
+ func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];
+}
+
+static bool is_bpf_list_push_kfunc(u32 func_id)
+{
+ return func_id == special_kfunc_list[KF_bpf_list_push_front] ||
+ func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ func_id == special_kfunc_list[KF_bpf_list_push_back] ||
+ func_id == special_kfunc_list[KF_bpf_list_push_back_impl];
+}
+
+static bool is_bpf_rbtree_add_kfunc(u32 func_id)
+{
+ return func_id == special_kfunc_list[KF_bpf_rbtree_add] ||
+ func_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
+}
+
static bool is_task_work_add_kfunc(u32 func_id)
{
return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] ||
@@ -12566,10 +11319,8 @@ static bool is_task_work_add_kfunc(u32 func_id)
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
{
- if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
- meta->arg_owning_ref) {
+ if (is_bpf_refcount_acquire_kfunc(meta->func_id) && meta->arg_owning_ref)
return false;
- }
return meta->kfunc_flags & KF_RET_NULL;
}
@@ -12594,7 +11345,7 @@ static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta)
return meta->func_id == special_kfunc_list[KF_bpf_preempt_enable];
}
-static bool is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta)
+bool bpf_is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta)
{
return meta->func_id == special_kfunc_list[KF_bpf_xdp_pull_data];
}
@@ -12629,7 +11380,7 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
return KF_ARG_PTR_TO_CTX;
- if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg) &&
+ if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && bpf_register_is_null(reg) &&
!arg_mem_size)
return KF_ARG_PTR_TO_NULL;
@@ -12755,13 +11506,12 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id))
strict_type_match = true;
- WARN_ON_ONCE(is_kfunc_release(meta) &&
- (reg->off || !tnum_is_const(reg->var_off) ||
- reg->var_off.value));
+ WARN_ON_ONCE(is_kfunc_release(meta) && !tnum_is_const(reg->var_off));
reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, &reg_ref_id);
reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off);
- struct_same = btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match);
+ struct_same = btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->var_off.value,
+ meta->btf, ref_id, strict_type_match);
/* If kfunc is accepting a projection type (ie. __sk_buff), it cannot
* actually use it -- it must cast to the underlying type. So we allow
* caller to pass in the underlying type.
@@ -12958,8 +11708,7 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_
static bool is_bpf_list_api_kfunc(u32 btf_id)
{
- return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
- btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+ return is_bpf_list_push_kfunc(btf_id) ||
btf_id == special_kfunc_list[KF_bpf_list_pop_front] ||
btf_id == special_kfunc_list[KF_bpf_list_pop_back] ||
btf_id == special_kfunc_list[KF_bpf_list_front] ||
@@ -12968,7 +11717,7 @@ static bool is_bpf_list_api_kfunc(u32 btf_id)
static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
{
- return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
+ return is_bpf_rbtree_add_kfunc(btf_id) ||
btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
btf_id == special_kfunc_list[KF_bpf_rbtree_first] ||
btf_id == special_kfunc_list[KF_bpf_rbtree_root] ||
@@ -12985,8 +11734,9 @@ static bool is_bpf_iter_num_api_kfunc(u32 btf_id)
static bool is_bpf_graph_api_kfunc(u32 btf_id)
{
- return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id) ||
- btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];
+ return is_bpf_list_api_kfunc(btf_id) ||
+ is_bpf_rbtree_api_kfunc(btf_id) ||
+ is_bpf_refcount_acquire_kfunc(btf_id);
}
static bool is_bpf_res_spin_lock_kfunc(u32 btf_id)
@@ -13019,7 +11769,7 @@ static bool kfunc_spin_allowed(u32 btf_id)
static bool is_sync_callback_calling_kfunc(u32 btf_id)
{
- return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
+ return is_bpf_rbtree_add_kfunc(btf_id);
}
static bool is_async_callback_calling_kfunc(u32 btf_id)
@@ -13083,12 +11833,11 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
switch (node_field_type) {
case BPF_LIST_NODE:
- ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
- kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back_impl]);
+ ret = is_bpf_list_push_kfunc(kfunc_btf_id);
break;
case BPF_RB_NODE:
- ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
- kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
+ ret = (is_bpf_rbtree_add_kfunc(kfunc_btf_id) ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_left] ||
kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_right]);
break;
@@ -13133,7 +11882,7 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env,
}
rec = reg_btf_record(reg);
- head_off = reg->off + reg->var_off.value;
+ head_off = reg->var_off.value;
field = btf_record_find(rec, head_off, head_field_type);
if (!field) {
verbose(env, "%s not found at offset=%u\n", head_type_name, head_off);
@@ -13200,7 +11949,7 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
return -EINVAL;
}
- node_off = reg->off + reg->var_off.value;
+ node_off = reg->var_off.value;
field = reg_find_field_offset(reg, node_off, node_field_type);
if (!field) {
verbose(env, "%s not found at offset=%u\n", node_type_name, node_off);
@@ -13305,11 +12054,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
bool is_ret_buf_sz = false;
int kf_arg_type;
- t = btf_type_skip_modifiers(btf, args[i].type, NULL);
-
- if (is_kfunc_arg_ignore(btf, &args[i]))
- continue;
-
if (is_kfunc_arg_prog_aux(btf, &args[i])) {
/* Reject repeated use bpf_prog_aux */
if (meta->arg_prog) {
@@ -13321,6 +12065,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
continue;
}
+ if (is_kfunc_arg_ignore(btf, &args[i]) || is_kfunc_arg_implicit(meta, i))
+ continue;
+
+ t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+
if (btf_type_is_scalar(t)) {
if (reg->type != SCALAR_VALUE) {
verbose(env, "R%d is not a scalar\n", regno);
@@ -13372,7 +12121,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return -EINVAL;
}
- if ((register_is_null(reg) || type_may_be_null(reg->type)) &&
+ if ((bpf_register_is_null(reg) || type_may_be_null(reg->type)) &&
!is_kfunc_arg_nullable(meta->btf, &args[i])) {
verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
return -EACCES;
@@ -13449,7 +12198,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
}
}
fallthrough;
- case KF_ARG_PTR_TO_CTX:
case KF_ARG_PTR_TO_DYNPTR:
case KF_ARG_PTR_TO_ITER:
case KF_ARG_PTR_TO_LIST_HEAD:
@@ -13467,6 +12215,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_IRQ_FLAG:
case KF_ARG_PTR_TO_RES_SPIN_LOCK:
break;
+ case KF_ARG_PTR_TO_CTX:
+ arg_type = ARG_PTR_TO_CTX;
+ break;
default:
verifier_bug(env, "unknown kfunc arg type %d", kf_arg_type);
return -EFAULT;
@@ -13495,13 +12246,13 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
break;
case KF_ARG_PTR_TO_ALLOC_BTF_ID:
if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) {
- if (meta->func_id != special_kfunc_list[KF_bpf_obj_drop_impl]) {
- verbose(env, "arg#%d expected for bpf_obj_drop_impl()\n", i);
+ if (!is_bpf_obj_drop_kfunc(meta->func_id)) {
+ verbose(env, "arg#%d expected for bpf_obj_drop()\n", i);
return -EINVAL;
}
} else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC | MEM_PERCPU)) {
- if (meta->func_id != special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
- verbose(env, "arg#%d expected for bpf_percpu_obj_drop_impl()\n", i);
+ if (!is_bpf_percpu_obj_drop_kfunc(meta->func_id)) {
+ verbose(env, "arg#%d expected for bpf_percpu_obj_drop()\n", i);
return -EINVAL;
}
} else {
@@ -13627,7 +12378,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return ret;
break;
case KF_ARG_PTR_TO_RB_NODE:
- if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ if (is_bpf_rbtree_add_kfunc(meta->func_id)) {
if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
verbose(env, "arg#%d expected pointer to allocated object\n", i);
return -EINVAL;
@@ -13690,7 +12441,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
struct bpf_reg_state *size_reg = &regs[regno + 1];
const struct btf_param *size_arg = &args[i + 1];
- if (!register_is_null(buff_reg) || !is_kfunc_arg_nullable(meta->btf, buff_arg)) {
+ if (!bpf_register_is_null(buff_reg) || !is_kfunc_arg_nullable(meta->btf, buff_arg)) {
ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
if (ret < 0) {
verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
@@ -13823,10 +12574,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return 0;
}
-static int fetch_kfunc_arg_meta(struct bpf_verifier_env *env,
- s32 func_id,
- s16 offset,
- struct bpf_kfunc_call_arg_meta *meta)
+int bpf_fetch_kfunc_arg_meta(struct bpf_verifier_env *env,
+ s32 func_id,
+ s16 offset,
+ struct bpf_kfunc_call_arg_meta *meta)
{
struct bpf_kfunc_meta kfunc;
int err;
@@ -13849,6 +12600,194 @@ static int fetch_kfunc_arg_meta(struct bpf_verifier_env *env,
return 0;
}
+/*
+ * Determine how many bytes a helper accesses through a stack pointer at
+ * argument position @arg (0-based, corresponding to R1-R5).
+ *
+ * Returns:
+ * > 0 known read access size in bytes
+ * 0 doesn't read anything directly
+ * S64_MIN unknown
+ * < 0 known write access of (-return) bytes
+ */
+s64 bpf_helper_stack_access_bytes(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int arg, int insn_idx)
+{
+ struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
+ const struct bpf_func_proto *fn;
+ enum bpf_arg_type at;
+ s64 size;
+
+ if (bpf_get_helper_proto(env, insn->imm, &fn) < 0)
+ return S64_MIN;
+
+ at = fn->arg_type[arg];
+
+ switch (base_type(at)) {
+ case ARG_PTR_TO_MAP_KEY:
+ case ARG_PTR_TO_MAP_VALUE: {
+ bool is_key = base_type(at) == ARG_PTR_TO_MAP_KEY;
+ u64 val;
+ int i, map_reg;
+
+ for (i = 0; i < arg; i++) {
+ if (base_type(fn->arg_type[i]) == ARG_CONST_MAP_PTR)
+ break;
+ }
+ if (i >= arg)
+ goto scan_all_maps;
+
+ map_reg = BPF_REG_1 + i;
+
+ if (!(aux->const_reg_map_mask & BIT(map_reg)))
+ goto scan_all_maps;
+
+ i = aux->const_reg_vals[map_reg];
+ if (i < env->used_map_cnt) {
+ size = is_key ? env->used_maps[i]->key_size
+ : env->used_maps[i]->value_size;
+ goto out;
+ }
+scan_all_maps:
+ /*
+ * Map pointer is not known at this call site (e.g. different
+ * maps on merged paths). Conservatively return the largest
+ * key_size or value_size across all maps used by the program.
+ */
+ val = 0;
+ for (i = 0; i < env->used_map_cnt; i++) {
+ struct bpf_map *map = env->used_maps[i];
+ u32 sz = is_key ? map->key_size : map->value_size;
+
+ if (sz > val)
+ val = sz;
+ if (map->inner_map_meta) {
+ sz = is_key ? map->inner_map_meta->key_size
+ : map->inner_map_meta->value_size;
+ if (sz > val)
+ val = sz;
+ }
+ }
+ if (!val)
+ return S64_MIN;
+ size = val;
+ goto out;
+ }
+ case ARG_PTR_TO_MEM:
+ if (at & MEM_FIXED_SIZE) {
+ size = fn->arg_size[arg];
+ goto out;
+ }
+ if (arg + 1 < ARRAY_SIZE(fn->arg_type) &&
+ arg_type_is_mem_size(fn->arg_type[arg + 1])) {
+ int size_reg = BPF_REG_1 + arg + 1;
+
+ if (aux->const_reg_mask & BIT(size_reg)) {
+ size = (s64)aux->const_reg_vals[size_reg];
+ goto out;
+ }
+ /*
+ * Size arg is const on each path but differs across merged
+ * paths. MAX_BPF_STACK is a safe upper bound for reads.
+ */
+ if (at & MEM_UNINIT)
+ return 0;
+ return MAX_BPF_STACK;
+ }
+ return S64_MIN;
+ case ARG_PTR_TO_DYNPTR:
+ size = BPF_DYNPTR_SIZE;
+ break;
+ case ARG_PTR_TO_STACK:
+ /*
+ * Only used by bpf_calls_callback() helpers. The helper itself
+ * doesn't access stack. The callback subprog does and it's
+ * analyzed separately.
+ */
+ return 0;
+ default:
+ return S64_MIN;
+ }
+out:
+ /*
+ * MEM_UNINIT args are write-only: the helper initializes the
+ * buffer without reading it.
+ */
+ if (at & MEM_UNINIT)
+ return -size;
+ return size;
+}
+
+/*
+ * Determine how many bytes a kfunc accesses through a stack pointer at
+ * argument position @arg (0-based, corresponding to R1-R5).
+ *
+ * Returns:
+ * > 0 known read access size in bytes
+ * 0 doesn't access memory through that argument (ex: not a pointer)
+ * S64_MIN unknown
+ * < 0 known write access of (-return) bytes
+ */
+s64 bpf_kfunc_stack_access_bytes(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int arg, int insn_idx)
+{
+ struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
+ struct bpf_kfunc_call_arg_meta meta;
+ const struct btf_param *args;
+ const struct btf_type *t, *ref_t;
+ const struct btf *btf;
+ u32 nargs, type_size;
+ s64 size;
+
+ if (bpf_fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta) < 0)
+ return S64_MIN;
+
+ btf = meta.btf;
+ args = btf_params(meta.func_proto);
+ nargs = btf_type_vlen(meta.func_proto);
+ if (arg >= nargs)
+ return 0;
+
+ t = btf_type_skip_modifiers(btf, args[arg].type, NULL);
+ if (!btf_type_is_ptr(t))
+ return 0;
+
+ /* dynptr: fixed 16-byte on-stack representation */
+ if (is_kfunc_arg_dynptr(btf, &args[arg])) {
+ size = BPF_DYNPTR_SIZE;
+ goto out;
+ }
+
+ /* ptr + __sz/__szk pair: size is in the next register */
+ if (arg + 1 < nargs &&
+ (btf_param_match_suffix(btf, &args[arg + 1], "__sz") ||
+ btf_param_match_suffix(btf, &args[arg + 1], "__szk"))) {
+ int size_reg = BPF_REG_1 + arg + 1;
+
+ if (aux->const_reg_mask & BIT(size_reg)) {
+ size = (s64)aux->const_reg_vals[size_reg];
+ goto out;
+ }
+ return MAX_BPF_STACK;
+ }
+
+ /* fixed-size pointed-to type: resolve via BTF */
+ ref_t = btf_type_skip_modifiers(btf, t->type, NULL);
+ if (!IS_ERR(btf_resolve_size(btf, ref_t, &type_size))) {
+ size = type_size;
+ goto out;
+ }
+
+ return S64_MIN;
+out:
+ /* KF_ITER_NEW kfuncs initialize the iterator state at arg 0 */
+ if (arg == 0 && meta.kfunc_flags & KF_ITER_NEW)
+ return -size;
+ if (is_kfunc_arg_uninit(btf, &args[arg]))
+ return -size;
+ return size;
+}
+
/* check special kfuncs and return:
* 1 - not fall-through to 'else' branch, continue verification
* 0 - fall-through to 'else' branch
@@ -13864,13 +12803,12 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca
if (meta->btf != btf_vmlinux)
return 0;
- if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
- meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ if (is_bpf_obj_new_kfunc(meta->func_id) || is_bpf_percpu_obj_new_kfunc(meta->func_id)) {
struct btf_struct_meta *struct_meta;
struct btf *ret_btf;
u32 ret_btf_id;
- if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
+ if (is_bpf_obj_new_kfunc(meta->func_id) && !bpf_global_ma_set)
return -ENOMEM;
if (((u64)(u32)meta->arg_constant.value) != meta->arg_constant.value) {
@@ -13893,7 +12831,7 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca
return -EINVAL;
}
- if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ if (is_bpf_percpu_obj_new_kfunc(meta->func_id)) {
if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) {
verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n",
ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE);
@@ -13923,7 +12861,7 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca
}
struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id);
- if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ if (is_bpf_percpu_obj_new_kfunc(meta->func_id)) {
if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {
verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n");
return -EINVAL;
@@ -13939,12 +12877,12 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca
regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
regs[BPF_REG_0].btf = ret_btf;
regs[BPF_REG_0].btf_id = ret_btf_id;
- if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl])
+ if (is_bpf_percpu_obj_new_kfunc(meta->func_id))
regs[BPF_REG_0].type |= MEM_PERCPU;
insn_aux->obj_new_size = ret_t->size;
insn_aux->kptr_struct_meta = struct_meta;
- } else if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
+ } else if (is_bpf_refcount_acquire_kfunc(meta->func_id)) {
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
regs[BPF_REG_0].btf = meta->arg_btf;
@@ -14030,6 +12968,8 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca
}
static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name);
+static int process_bpf_exit_full(struct bpf_verifier_env *env,
+ bool *do_print_state, bool exception_exit);
static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
@@ -14049,7 +12989,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (!insn->imm)
return 0;
- err = fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta);
+ err = bpf_fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta);
if (err == -EACCES && meta.func_name)
verbose(env, "calling kernel function %s is not allowed\n", meta.func_name);
if (err)
@@ -14058,7 +12998,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
func_name = meta.func_name;
insn_aux = &env->insn_aux_data[insn_idx];
- insn_aux->is_iter_next = is_iter_next_kfunc(&meta);
+ insn_aux->is_iter_next = bpf_is_iter_next_kfunc(&meta);
if (!insn->off &&
(insn->imm == special_kfunc_list[KF_bpf_res_spin_lock] ||
@@ -14076,7 +13016,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
/* Clear r0-r5 registers in forked state */
for (i = 0; i < CALLER_SAVED_REGS; i++)
- mark_reg_not_init(env, regs, caller_saved[i]);
+ bpf_mark_reg_not_init(env, &regs[caller_saved[i]]);
mark_reg_unknown(env, regs, BPF_REG_0);
err = __mark_reg_s32_range(env, regs, BPF_REG_0, -MAX_ERRNO, -1);
@@ -14095,7 +13035,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EACCES;
}
- sleepable = is_kfunc_sleepable(&meta);
+ sleepable = bpf_is_kfunc_sleepable(&meta);
if (sleepable && !in_sleepable(env)) {
verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name);
return -EACCES;
@@ -14110,7 +13050,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (err < 0)
return err;
- if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ if (is_bpf_rbtree_add_kfunc(meta.func_id)) {
err = push_callback_call(env, insn, insn_idx, meta.subprogno,
set_rbtree_add_callback_state);
if (err) {
@@ -14170,34 +13110,24 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}));
}
- } else if (sleepable && env->cur_state->active_rcu_locks) {
- verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name);
- return -EACCES;
- }
-
- if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
- verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
- return -EACCES;
- }
-
- if (env->cur_state->active_preempt_locks) {
- if (preempt_disable) {
- env->cur_state->active_preempt_locks++;
- } else if (preempt_enable) {
- env->cur_state->active_preempt_locks--;
- } else if (sleepable) {
- verbose(env, "kernel func %s is sleepable within non-preemptible region\n", func_name);
- return -EACCES;
- }
} else if (preempt_disable) {
env->cur_state->active_preempt_locks++;
} else if (preempt_enable) {
- verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name);
- return -EINVAL;
+ if (env->cur_state->active_preempt_locks == 0) {
+ verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name);
+ return -EINVAL;
+ }
+ env->cur_state->active_preempt_locks--;
}
- if (env->cur_state->active_irq_id && sleepable) {
- verbose(env, "kernel func %s is sleepable within IRQ-disabled region\n", func_name);
+ if (sleepable && !in_sleepable_context(env)) {
+ verbose(env, "kernel func %s is sleepable within %s\n",
+ func_name, non_sleepable_context_description(env));
+ return -EACCES;
+ }
+
+ if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
+ verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
return -EACCES;
}
@@ -14224,11 +13154,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return err;
}
- if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
- meta.func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
- meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ if (is_bpf_list_push_kfunc(meta.func_id) || is_bpf_rbtree_add_kfunc(meta.func_id)) {
release_ref_obj_id = regs[BPF_REG_2].ref_obj_id;
- insn_aux->insert_off = regs[BPF_REG_2].off;
+ insn_aux->insert_off = regs[BPF_REG_2].var_off.value;
insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id);
err = ref_convert_owning_non_owning(env, release_ref_obj_id);
if (err) {
@@ -14266,7 +13194,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
for (i = 0; i < CALLER_SAVED_REGS; i++) {
u32 regno = caller_saved[i];
- mark_reg_not_init(env, regs, regno);
+ bpf_mark_reg_not_init(env, &regs[regno]);
regs[regno].subreg_def = DEF_NOT_SUBREG;
}
@@ -14274,11 +13202,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL);
if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) {
- /* Only exception is bpf_obj_new_impl */
if (meta.btf != btf_vmlinux ||
- (meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] &&
- meta.func_id != special_kfunc_list[KF_bpf_percpu_obj_new_impl] &&
- meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) {
+ (!is_bpf_obj_new_kfunc(meta.func_id) &&
+ !is_bpf_percpu_obj_new_kfunc(meta.func_id) &&
+ !is_bpf_refcount_acquire_kfunc(meta.func_id))) {
verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
return -EINVAL;
}
@@ -14338,7 +13265,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache])
type |= PTR_UNTRUSTED;
else if (is_kfunc_rcu_protected(&meta) ||
- (is_iter_next_kfunc(&meta) &&
+ (bpf_is_iter_next_kfunc(&meta) &&
(get_iter_from_state(env->cur_state, &meta)
->type & MEM_RCU))) {
/*
@@ -14389,8 +13316,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
regs[BPF_REG_0].id = ++env->id_gen;
} else if (btf_type_is_void(t)) {
if (meta.btf == btf_vmlinux) {
- if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
- meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
+ if (is_bpf_obj_drop_kfunc(meta.func_id) ||
+ is_bpf_percpu_obj_drop_kfunc(meta.func_id)) {
insn_aux->kptr_struct_meta =
btf_find_struct_meta(meta.arg_btf,
meta.arg_btf_id);
@@ -14398,7 +13325,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
- if (is_kfunc_pkt_changing(&meta))
+ if (bpf_is_kfunc_pkt_changing(&meta))
clear_all_pkt_pointers(env);
nargs = btf_type_vlen(meta.func_proto);
@@ -14410,11 +13337,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (btf_type_is_ptr(t))
mark_btf_func_reg_size(env, regno, sizeof(void *));
else
- /* scalar. ensured by btf_check_kfunc_arg_match() */
+ /* scalar. ensured by check_kfunc_args() */
mark_btf_func_reg_size(env, regno, t->size);
}
- if (is_iter_next_kfunc(&meta)) {
+ if (bpf_is_iter_next_kfunc(&meta)) {
err = process_iter_next_call(env, insn_idx, &meta);
if (err)
return err;
@@ -14423,12 +13350,15 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie])
env->prog->call_session_cookie = true;
+ if (is_bpf_throw_kfunc(insn))
+ return process_bpf_exit_full(env, NULL, true);
+
return 0;
}
-static bool check_reg_sane_offset(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg,
- enum bpf_reg_type type)
+static bool check_reg_sane_offset_scalar(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ enum bpf_reg_type type)
{
bool known = tnum_is_const(reg->var_off);
s64 val = reg->var_off.value;
@@ -14440,12 +13370,6 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
return false;
}
- if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
- verbose(env, "%s pointer offset %d is not allowed\n",
- reg_type_str(env, type), reg->off);
- return false;
- }
-
if (smin == S64_MIN) {
verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
reg_type_str(env, type));
@@ -14461,6 +13385,29 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
return true;
}
+static bool check_reg_sane_offset_ptr(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ enum bpf_reg_type type)
+{
+ bool known = tnum_is_const(reg->var_off);
+ s64 val = reg->var_off.value;
+ s64 smin = reg->smin_value;
+
+ if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
+ verbose(env, "%s pointer offset %lld is not allowed\n",
+ reg_type_str(env, type), val);
+ return false;
+ }
+
+ if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
+ verbose(env, "%s pointer offset %lld is not allowed\n",
+ reg_type_str(env, type), smin);
+ return false;
+ }
+
+ return true;
+}
+
enum {
REASON_BOUNDS = -1,
REASON_TYPE = -2,
@@ -14482,13 +13429,11 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
* currently prohibited for unprivileged.
*/
max = MAX_BPF_STACK + mask_to_left;
- ptr_limit = -(ptr_reg->var_off.value + ptr_reg->off);
+ ptr_limit = -ptr_reg->var_off.value;
break;
case PTR_TO_MAP_VALUE:
max = ptr_reg->map_ptr->value_size;
- ptr_limit = (mask_to_left ?
- ptr_reg->smin_value :
- ptr_reg->umax_value) + ptr_reg->off;
+ ptr_limit = mask_to_left ? ptr_reg->smin_value : ptr_reg->umax_value;
break;
default:
return REASON_TYPE;
@@ -14719,9 +13664,6 @@ static int sanitize_err(struct bpf_verifier_env *env,
* Variable offset is prohibited for unprivileged mode for simplicity since it
* requires corresponding support in Spectre masking for stack ALU. See also
* retrieve_ptr_limit().
- *
- *
- * 'off' includes 'reg->off'.
*/
static int check_stack_access_for_ptr_arithmetic(
struct bpf_verifier_env *env,
@@ -14762,11 +13704,11 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env,
switch (dst_reg->type) {
case PTR_TO_STACK:
if (check_stack_access_for_ptr_arithmetic(env, dst, dst_reg,
- dst_reg->off + dst_reg->var_off.value))
+ dst_reg->var_off.value))
return -EACCES;
break;
case PTR_TO_MAP_VALUE:
- if (check_map_access(env, dst, dst_reg->off, 1, false, ACCESS_HELPER)) {
+ if (check_map_access(env, dst, 0, 1, false, ACCESS_HELPER)) {
verbose(env, "R%d pointer arithmetic of map value goes out of range, "
"prohibited for !root\n", dst);
return -EACCES;
@@ -14874,8 +13816,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg->type = ptr_reg->type;
dst_reg->id = ptr_reg->id;
- if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) ||
- !check_reg_sane_offset(env, ptr_reg, ptr_reg->type))
+ if (!check_reg_sane_offset_scalar(env, off_reg, ptr_reg->type) ||
+ !check_reg_sane_offset_ptr(env, ptr_reg, ptr_reg->type))
return -EINVAL;
/* pointer types do not carry 32-bit bounds at the moment. */
@@ -14890,23 +13832,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
switch (opcode) {
case BPF_ADD:
- /* We can take a fixed offset as long as it doesn't overflow
- * the s32 'off' field
- */
- if (known && (ptr_reg->off + smin_val ==
- (s64)(s32)(ptr_reg->off + smin_val))) {
- /* pointer += K. Accumulate it into fixed offset */
- dst_reg->smin_value = smin_ptr;
- dst_reg->smax_value = smax_ptr;
- dst_reg->umin_value = umin_ptr;
- dst_reg->umax_value = umax_ptr;
- dst_reg->var_off = ptr_reg->var_off;
- dst_reg->off = ptr_reg->off + smin_val;
- dst_reg->raw = ptr_reg->raw;
- break;
- }
- /* A new variable offset is created. Note that off_reg->off
- * == 0, since it's a scalar.
+ /*
* dst_reg gets the pointer type and since some positive
* integer value was added to the pointer, give it a new 'id'
* if it's a PTR_TO_PACKET.
@@ -14925,12 +13851,18 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg->umax_value = U64_MAX;
}
dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
- dst_reg->off = ptr_reg->off;
dst_reg->raw = ptr_reg->raw;
if (reg_is_pkt_pointer(ptr_reg)) {
- dst_reg->id = ++env->id_gen;
- /* something was added to pkt_ptr, set range to zero */
- memset(&dst_reg->raw, 0, sizeof(dst_reg->raw));
+ if (!known)
+ dst_reg->id = ++env->id_gen;
+ /*
+ * Clear range for unknown addends since we can't know
+ * where the pkt pointer ended up. Also clear AT_PKT_END /
+ * BEYOND_PKT_END from prior comparison as any pointer
+ * arithmetic invalidates them.
+ */
+ if (!known || dst_reg->range < 0)
+ memset(&dst_reg->raw, 0, sizeof(dst_reg->raw));
}
break;
case BPF_SUB:
@@ -14949,19 +13881,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst);
return -EACCES;
}
- if (known && (ptr_reg->off - smin_val ==
- (s64)(s32)(ptr_reg->off - smin_val))) {
- /* pointer -= K. Subtract it from fixed offset */
- dst_reg->smin_value = smin_ptr;
- dst_reg->smax_value = smax_ptr;
- dst_reg->umin_value = umin_ptr;
- dst_reg->umax_value = umax_ptr;
- dst_reg->var_off = ptr_reg->var_off;
- dst_reg->id = ptr_reg->id;
- dst_reg->off = ptr_reg->off - smin_val;
- dst_reg->raw = ptr_reg->raw;
- break;
- }
/* A new variable offset is created. If the subtrahend is known
* nonnegative, then any reg->range we had before is still good.
*/
@@ -14981,12 +13900,18 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg->umax_value = umax_ptr - umin_val;
}
dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
- dst_reg->off = ptr_reg->off;
dst_reg->raw = ptr_reg->raw;
if (reg_is_pkt_pointer(ptr_reg)) {
- dst_reg->id = ++env->id_gen;
- /* something was added to pkt_ptr, set range to zero */
- if (smin_val < 0)
+ if (!known)
+ dst_reg->id = ++env->id_gen;
+ /*
+ * Clear range if the subtrahend may be negative since
+ * pkt pointer could move past its bounds. A positive
+ * subtrahend moves it backwards keeping positive range
+ * intact. Also clear AT_PKT_END / BEYOND_PKT_END from
+ * prior comparison as arithmetic invalidates them.
+ */
+ if ((!known && smin_val < 0) || dst_reg->range < 0)
memset(&dst_reg->raw, 0, sizeof(dst_reg->raw));
}
break;
@@ -15004,7 +13929,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
return -EACCES;
}
- if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
+ if (!check_reg_sane_offset_ptr(env, dst_reg, ptr_reg->type))
return -EINVAL;
reg_bounds_sync(dst_reg);
bounds_ret = sanitize_check_bounds(env, insn, dst_reg);
@@ -15856,6 +14781,13 @@ static void scalar_byte_swap(struct bpf_reg_state *dst_reg, struct bpf_insn *ins
/* Apply bswap if alu64 or switch between big-endian and little-endian machines */
bool need_bswap = alu64 || (to_le == is_big_endian);
+ /*
+ * If the register is mutated, manually reset its scalar ID to break
+ * any existing ties and avoid incorrect bounds propagation.
+ */
+ if (need_bswap || insn->imm == 16 || insn->imm == 32)
+ clear_scalar_id(dst_reg);
+
if (need_bswap) {
if (insn->imm == 16)
dst_reg->var_off = tnum_bswap16(dst_reg->var_off);
@@ -15938,7 +14870,7 @@ static int maybe_fork_scalars(struct bpf_verifier_env *env, struct bpf_insn *ins
else
return 0;
- branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false);
+ branch = push_stack(env, env->insn_idx, env->insn_idx, false);
if (IS_ERR(branch))
return PTR_ERR(branch);
@@ -16127,11 +15059,20 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
int err;
dst_reg = &regs[insn->dst_reg];
- src_reg = NULL;
+ if (BPF_SRC(insn->code) == BPF_X)
+ src_reg = &regs[insn->src_reg];
+ else
+ src_reg = NULL;
- if (dst_reg->type == PTR_TO_ARENA) {
+ /* Case where at least one operand is an arena. */
+ if (dst_reg->type == PTR_TO_ARENA || (src_reg && src_reg->type == PTR_TO_ARENA)) {
struct bpf_insn_aux_data *aux = cur_aux(env);
+ if (dst_reg->type != PTR_TO_ARENA)
+ *dst_reg = *src_reg;
+
+ dst_reg->subreg_def = env->insn_idx + 1;
+
if (BPF_CLASS(insn->code) == BPF_ALU64)
/*
* 32-bit operations zero upper bits automatically.
@@ -16147,7 +15088,6 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
ptr_reg = dst_reg;
if (BPF_SRC(insn->code) == BPF_X) {
- src_reg = &regs[insn->src_reg];
if (src_reg->type != SCALAR_VALUE) {
if (dst_reg->type != SCALAR_VALUE) {
/* Combining two pointers by any ALU op yields
@@ -16230,7 +15170,8 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
*/
if (env->bpf_capable &&
(BPF_OP(insn->code) == BPF_ADD || BPF_OP(insn->code) == BPF_SUB) &&
- dst_reg->id && is_reg_const(src_reg, alu32)) {
+ dst_reg->id && is_reg_const(src_reg, alu32) &&
+ !(BPF_SRC(insn->code) == BPF_X && insn->src_reg == insn->dst_reg)) {
u64 val = reg_const_value(src_reg, alu32);
s32 off;
@@ -16255,21 +15196,20 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
* we cannot accumulate another val into rx->off.
*/
clear_id:
- dst_reg->off = 0;
- dst_reg->id = 0;
+ clear_scalar_id(dst_reg);
} else {
if (alu32)
dst_reg->id |= BPF_ADD_CONST32;
else
dst_reg->id |= BPF_ADD_CONST64;
- dst_reg->off = off;
+ dst_reg->delta = off;
}
} else {
/*
* Make sure ID is cleared otherwise dst_reg min/max could be
* incorrectly propagated into other registers by sync_linked_regs()
*/
- dst_reg->id = 0;
+ clear_scalar_id(dst_reg);
}
return 0;
}
@@ -16282,23 +15222,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
int err;
if (opcode == BPF_END || opcode == BPF_NEG) {
- if (opcode == BPF_NEG) {
- if (BPF_SRC(insn->code) != BPF_K ||
- insn->src_reg != BPF_REG_0 ||
- insn->off != 0 || insn->imm != 0) {
- verbose(env, "BPF_NEG uses reserved fields\n");
- return -EINVAL;
- }
- } else {
- if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
- (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
- (BPF_CLASS(insn->code) == BPF_ALU64 &&
- BPF_SRC(insn->code) != BPF_TO_LE)) {
- verbose(env, "BPF_END uses reserved fields\n");
- return -EINVAL;
- }
- }
-
/* check src operand */
err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
@@ -16311,8 +15234,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check dest operand */
- if ((opcode == BPF_NEG || opcode == BPF_END) &&
- regs[insn->dst_reg].type == SCALAR_VALUE) {
+ if (regs[insn->dst_reg].type == SCALAR_VALUE) {
err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
err = err ?: adjust_scalar_min_max_vals(env, insn,
&regs[insn->dst_reg],
@@ -16326,38 +15248,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else if (opcode == BPF_MOV) {
if (BPF_SRC(insn->code) == BPF_X) {
- if (BPF_CLASS(insn->code) == BPF_ALU) {
- if ((insn->off != 0 && insn->off != 8 && insn->off != 16) ||
- insn->imm) {
- verbose(env, "BPF_MOV uses reserved fields\n");
- return -EINVAL;
- }
- } else if (insn->off == BPF_ADDR_SPACE_CAST) {
- if (insn->imm != 1 && insn->imm != 1u << 16) {
- verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n");
- return -EINVAL;
- }
+ if (insn->off == BPF_ADDR_SPACE_CAST) {
if (!env->prog->aux->arena) {
verbose(env, "addr_space_cast insn can only be used in a program that has an associated arena\n");
return -EINVAL;
}
- } else {
- if ((insn->off != 0 && insn->off != 8 && insn->off != 16 &&
- insn->off != 32) || insn->imm) {
- verbose(env, "BPF_MOV uses reserved fields\n");
- return -EINVAL;
- }
}
/* check src operand */
err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
- } else {
- if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
- verbose(env, "BPF_MOV uses reserved fields\n");
- return -EINVAL;
- }
}
/* check dest operand, mark as required later */
@@ -16400,7 +15301,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
assign_scalar_id_before_mov(env, src_reg);
copy_register_state(dst_reg, src_reg);
if (!no_sext)
- dst_reg->id = 0;
+ clear_scalar_id(dst_reg);
coerce_reg_to_size_sx(dst_reg, insn->off >> 3);
dst_reg->subreg_def = DEF_NOT_SUBREG;
} else {
@@ -16426,7 +15327,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
* propagated into src_reg by sync_linked_regs()
*/
if (!is_src_reg_u32)
- dst_reg->id = 0;
+ clear_scalar_id(dst_reg);
dst_reg->subreg_def = env->insn_idx + 1;
} else {
/* case: W1 = (s8, s16)W2 */
@@ -16436,7 +15337,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
assign_scalar_id_before_mov(env, src_reg);
copy_register_state(dst_reg, src_reg);
if (!no_sext)
- dst_reg->id = 0;
+ clear_scalar_id(dst_reg);
dst_reg->subreg_def = env->insn_idx + 1;
coerce_subreg_to_size_sx(dst_reg, insn->off >> 3);
}
@@ -16463,28 +15364,13 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
}
- } else if (opcode > BPF_END) {
- verbose(env, "invalid BPF_ALU opcode %x\n", opcode);
- return -EINVAL;
-
} else { /* all other ALU ops: and, sub, xor, add, ... */
if (BPF_SRC(insn->code) == BPF_X) {
- if (insn->imm != 0 || (insn->off != 0 && insn->off != 1) ||
- (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
- verbose(env, "BPF_ALU uses reserved fields\n");
- return -EINVAL;
- }
/* check src1 operand */
err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
- } else {
- if (insn->src_reg != BPF_REG_0 || (insn->off != 0 && insn->off != 1) ||
- (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
- verbose(env, "BPF_ALU uses reserved fields\n");
- return -EINVAL;
- }
}
/* check src2 operand */
@@ -16527,19 +15413,17 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
struct bpf_reg_state *reg;
int new_range;
- if (dst_reg->off < 0 ||
- (dst_reg->off == 0 && range_right_open))
+ if (dst_reg->umax_value == 0 && range_right_open)
/* This doesn't give us any range */
return;
- if (dst_reg->umax_value > MAX_PACKET_OFF ||
- dst_reg->umax_value + dst_reg->off > MAX_PACKET_OFF)
+ if (dst_reg->umax_value > MAX_PACKET_OFF)
/* Risk of overflow. For instance, ptr + (1<<63) may be less
* than pkt_end, but that's because it's also less than pkt.
*/
return;
- new_range = dst_reg->off;
+ new_range = dst_reg->umax_value;
if (range_right_open)
new_range++;
@@ -16588,7 +15472,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
/* If our ids match, then we must have the same max_value. And we
* don't care about the other reg's fixed offset, since if it's too big
* the range won't allow anything.
- * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
+ * dst_reg->umax_value is known < MAX_PACKET_OFF, therefore it fits in a u16.
*/
bpf_for_each_reg_in_vstate(vstate, state, reg, ({
if (reg->type == type && reg->id == dst_reg->id)
@@ -16597,11 +15481,50 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
}));
}
+static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
+ u8 opcode, bool is_jmp32);
+static u8 rev_opcode(u8 opcode);
+
+/*
+ * Learn more information about live branches by simulating refinement on both branches.
+ * regs_refine_cond_op() is sound, so producing ill-formed register bounds for the branch means
+ * that branch is dead.
+ */
+static int simulate_both_branches_taken(struct bpf_verifier_env *env, u8 opcode, bool is_jmp32)
+{
+ /* Fallthrough (FALSE) branch */
+ regs_refine_cond_op(&env->false_reg1, &env->false_reg2, rev_opcode(opcode), is_jmp32);
+ reg_bounds_sync(&env->false_reg1);
+ reg_bounds_sync(&env->false_reg2);
+ /*
+ * If there is a range bounds violation in *any* of the abstract values in either
+ * reg_states in the FALSE branch (i.e. reg1, reg2), the FALSE branch must be dead. Only
+ * TRUE branch will be taken.
+ */
+ if (range_bounds_violation(&env->false_reg1) || range_bounds_violation(&env->false_reg2))
+ return 1;
+
+ /* Jump (TRUE) branch */
+ regs_refine_cond_op(&env->true_reg1, &env->true_reg2, opcode, is_jmp32);
+ reg_bounds_sync(&env->true_reg1);
+ reg_bounds_sync(&env->true_reg2);
+ /*
+ * If there is a range bounds violation in *any* of the abstract values in either
+ * reg_states in the TRUE branch (i.e. true_reg1, true_reg2), the TRUE branch must be dead.
+ * Only FALSE branch will be taken.
+ */
+ if (range_bounds_violation(&env->true_reg1) || range_bounds_violation(&env->true_reg2))
+ return 0;
+
+ /* Both branches are possible, we can't determine which one will be taken. */
+ return -1;
+}
+
/*
* <reg1> <op> <reg2>, currently assuming reg2 is a constant
*/
-static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
- u8 opcode, bool is_jmp32)
+static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_state *reg1,
+ struct bpf_reg_state *reg2, u8 opcode, bool is_jmp32)
{
struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off;
struct tnum t2 = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off;
@@ -16753,7 +15676,7 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta
break;
}
- return -1;
+ return simulate_both_branches_taken(env, opcode, is_jmp32);
}
static int flip_opcode(u32 opcode)
@@ -16824,8 +15747,8 @@ static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg,
* -1 - unknown. Example: "if (reg1 < 5)" is unknown when register value
* range [0,10]
*/
-static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
- u8 opcode, bool is_jmp32)
+static int is_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_state *reg1,
+ struct bpf_reg_state *reg2, u8 opcode, bool is_jmp32)
{
if (reg_is_pkt_pointer_any(reg1) && reg_is_pkt_pointer_any(reg2) && !is_jmp32)
return is_pkt_ptr_branch_taken(reg1, reg2, opcode);
@@ -16863,7 +15786,7 @@ static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg
}
/* now deal with two scalars, but not necessarily constants */
- return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32);
+ return is_scalar_branch_taken(env, reg1, reg2, opcode, is_jmp32);
}
/* Opcode that corresponds to a *false* branch condition.
@@ -16954,8 +15877,8 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state
/* u32_min_value is not equal to 0xffffffff at this point,
* because otherwise u32_max_value is 0xffffffff as well,
* in such a case both reg1 and reg2 would be constants,
- * jump would be predicted and reg_set_min_max() won't
- * be called.
+ * jump would be predicted and regs_refine_cond_op()
+ * wouldn't be called.
*
* Same reasoning works for all {u,s}{min,max}{32,64} cases
* below.
@@ -17062,49 +15985,15 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state
}
}
-/* Adjusts the register min/max values in the case that the dst_reg and
- * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K
- * check, in which case we have a fake SCALAR_VALUE representing insn->imm).
- * Technically we can do similar adjustments for pointers to the same object,
- * but we don't support that right now.
- */
-static int reg_set_min_max(struct bpf_verifier_env *env,
- struct bpf_reg_state *true_reg1,
- struct bpf_reg_state *true_reg2,
- struct bpf_reg_state *false_reg1,
- struct bpf_reg_state *false_reg2,
- u8 opcode, bool is_jmp32)
+/* Check for invariant violations on the registers for both branches of a condition */
+static int regs_bounds_sanity_check_branches(struct bpf_verifier_env *env)
{
int err;
- /* If either register is a pointer, we can't learn anything about its
- * variable offset from the compare (unless they were a pointer into
- * the same object, but we don't bother with that).
- */
- if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE)
- return 0;
-
- /* We compute branch direction for same SCALAR_VALUE registers in
- * is_scalar_branch_taken(). For unknown branch directions (e.g., BPF_JSET)
- * on the same registers, we don't need to adjust the min/max values.
- */
- if (false_reg1 == false_reg2)
- return 0;
-
- /* fallthrough (FALSE) branch */
- regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32);
- reg_bounds_sync(false_reg1);
- reg_bounds_sync(false_reg2);
-
- /* jump (TRUE) branch */
- regs_refine_cond_op(true_reg1, true_reg2, opcode, is_jmp32);
- reg_bounds_sync(true_reg1);
- reg_bounds_sync(true_reg2);
-
- err = reg_bounds_sanity_check(env, true_reg1, "true_reg1");
- err = err ?: reg_bounds_sanity_check(env, true_reg2, "true_reg2");
- err = err ?: reg_bounds_sanity_check(env, false_reg1, "false_reg1");
- err = err ?: reg_bounds_sanity_check(env, false_reg2, "false_reg2");
+ err = reg_bounds_sanity_check(env, &env->true_reg1, "true_reg1");
+ err = err ?: reg_bounds_sanity_check(env, &env->true_reg2, "true_reg2");
+ err = err ?: reg_bounds_sanity_check(env, &env->false_reg1, "false_reg1");
+ err = err ?: reg_bounds_sanity_check(env, &env->false_reg2, "false_reg2");
return err;
}
@@ -17114,29 +16003,24 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
{
if (type_may_be_null(reg->type) && reg->id == id &&
(is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) {
- /* Old offset (both fixed and variable parts) should have been
- * known-zero, because we don't allow pointer arithmetic on
- * pointers that might be NULL. If we see this happening, don't
- * convert the register.
+ /* Old offset should have been known-zero, because we don't
+ * allow pointer arithmetic on pointers that might be NULL.
+ * If we see this happening, don't convert the register.
*
* But in some cases, some helpers that return local kptrs
- * advance offset for the returned pointer. In those cases, it
- * is fine to expect to see reg->off.
+ * advance offset for the returned pointer. In those cases,
+ * it is fine to expect to see reg->var_off.
*/
- if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0)))
- return;
if (!(type_is_ptr_alloc_obj(reg->type) || type_is_non_owning_ref(reg->type)) &&
- WARN_ON_ONCE(reg->off))
+ WARN_ON_ONCE(!tnum_equals_const(reg->var_off, 0)))
return;
-
if (is_null) {
- reg->type = SCALAR_VALUE;
/* We don't need id and ref_obj_id from this point
* onwards anymore, thus we should better reset it,
* so that state pruning has chances to take effect.
*/
- reg->id = 0;
- reg->ref_obj_id = 0;
+ __mark_reg_known_zero(reg);
+ reg->type = SCALAR_VALUE;
return;
}
@@ -17297,7 +16181,7 @@ static void __collect_linked_regs(struct linked_regs *reg_set, struct bpf_reg_st
e->is_reg = is_reg;
e->regno = spi_or_reg;
} else {
- reg->id = 0;
+ clear_scalar_id(reg);
}
}
@@ -17305,22 +16189,29 @@ static void __collect_linked_regs(struct linked_regs *reg_set, struct bpf_reg_st
* in verifier state, save R in linked_regs if R->id == id.
* If there are too many Rs sharing same id, reset id for leftover Rs.
*/
-static void collect_linked_regs(struct bpf_verifier_state *vstate, u32 id,
+static void collect_linked_regs(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *vstate,
+ u32 id,
struct linked_regs *linked_regs)
{
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
struct bpf_func_state *func;
struct bpf_reg_state *reg;
+ u16 live_regs;
int i, j;
id = id & ~BPF_ADD_CONST;
for (i = vstate->curframe; i >= 0; i--) {
+ live_regs = aux[bpf_frame_insn_idx(vstate, i)].live_regs_before;
func = vstate->frame[i];
for (j = 0; j < BPF_REG_FP; j++) {
+ if (!(live_regs & BIT(j)))
+ continue;
reg = &func->regs[j];
__collect_linked_regs(linked_regs, reg, id, i, j, true);
}
for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
- if (!is_spilled_reg(&func->stack[j]))
+ if (!bpf_is_spilled_reg(&func->stack[j]))
continue;
reg = &func->stack[j].spilled_ptr;
__collect_linked_regs(linked_regs, reg, id, i, j, false);
@@ -17347,19 +16238,25 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s
continue;
if ((reg->id & ~BPF_ADD_CONST) != (known_reg->id & ~BPF_ADD_CONST))
continue;
+ /*
+ * Skip mixed 32/64-bit links: the delta relationship doesn't
+ * hold across different ALU widths.
+ */
+ if (((reg->id ^ known_reg->id) & BPF_ADD_CONST) == BPF_ADD_CONST)
+ continue;
if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) ||
- reg->off == known_reg->off) {
+ reg->delta == known_reg->delta) {
s32 saved_subreg_def = reg->subreg_def;
copy_register_state(reg, known_reg);
reg->subreg_def = saved_subreg_def;
} else {
s32 saved_subreg_def = reg->subreg_def;
- s32 saved_off = reg->off;
+ s32 saved_off = reg->delta;
u32 saved_id = reg->id;
fake_reg.type = SCALAR_VALUE;
- __mark_reg_known(&fake_reg, (s64)reg->off - (s64)known_reg->off);
+ __mark_reg_known(&fake_reg, (s64)reg->delta - (s64)known_reg->delta);
/* reg = known_reg; reg += delta */
copy_register_state(reg, known_reg);
@@ -17367,14 +16264,14 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s
* Must preserve off, id and subreg_def flag,
* otherwise another sync_linked_regs() will be incorrect.
*/
- reg->off = saved_off;
+ reg->delta = saved_off;
reg->id = saved_id;
reg->subreg_def = saved_subreg_def;
scalar32_min_max_add(reg, &fake_reg);
scalar_min_max_add(reg, &fake_reg);
reg->var_off = tnum_add(reg->var_off, fake_reg.var_off);
- if (known_reg->id & BPF_ADD_CONST32)
+ if ((reg->id | known_reg->id) & BPF_ADD_CONST32)
zext_32_to_64(reg);
reg_bounds_sync(reg);
}
@@ -17410,12 +16307,6 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
int idx = *insn_idx;
- if (insn->code != (BPF_JMP | BPF_JCOND) ||
- insn->src_reg != BPF_MAY_GOTO ||
- insn->dst_reg || insn->imm) {
- verbose(env, "invalid may_goto imm %d\n", insn->imm);
- return -EINVAL;
- }
prev_st = find_prev_entry(env, cur_st->parent, idx);
/* branch out 'fallthrough' insn as a new state to explore */
@@ -17437,11 +16328,6 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
dst_reg = &regs[insn->dst_reg];
if (BPF_SRC(insn->code) == BPF_X) {
- if (insn->imm != 0) {
- verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
- return -EINVAL;
- }
-
/* check src1 operand */
err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
@@ -17460,10 +16346,6 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (dst_reg->type == PTR_TO_STACK)
insn_flags |= INSN_F_DST_REG_STACK;
} else {
- if (insn->src_reg != BPF_REG_0) {
- verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
- return -EINVAL;
- }
src_reg = &env->fake_reg[0];
memset(src_reg, 0, sizeof(*src_reg));
src_reg->type = SCALAR_VALUE;
@@ -17474,13 +16356,17 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
if (insn_flags) {
- err = push_jmp_history(env, this_branch, insn_flags, 0);
+ err = bpf_push_jmp_history(env, this_branch, insn_flags, 0);
if (err)
return err;
}
is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
- pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32);
+ copy_register_state(&env->false_reg1, dst_reg);
+ copy_register_state(&env->false_reg2, src_reg);
+ copy_register_state(&env->true_reg1, dst_reg);
+ copy_register_state(&env->true_reg2, src_reg);
+ pred = is_branch_taken(env, dst_reg, src_reg, opcode, is_jmp32);
if (pred >= 0) {
/* If we get here with a dst_reg pointer type it is because
* above is_branch_taken() special cased the 0 comparison.
@@ -17530,11 +16416,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
* if parent state is created.
*/
if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id)
- collect_linked_regs(this_branch, src_reg->id, &linked_regs);
+ collect_linked_regs(env, this_branch, src_reg->id, &linked_regs);
if (dst_reg->type == SCALAR_VALUE && dst_reg->id)
- collect_linked_regs(this_branch, dst_reg->id, &linked_regs);
+ collect_linked_regs(env, this_branch, dst_reg->id, &linked_regs);
if (linked_regs.cnt > 1) {
- err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
+ err = bpf_push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
if (err)
return err;
}
@@ -17544,27 +16430,16 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return PTR_ERR(other_branch);
other_branch_regs = other_branch->frame[other_branch->curframe]->regs;
- if (BPF_SRC(insn->code) == BPF_X) {
- err = reg_set_min_max(env,
- &other_branch_regs[insn->dst_reg],
- &other_branch_regs[insn->src_reg],
- dst_reg, src_reg, opcode, is_jmp32);
- } else /* BPF_SRC(insn->code) == BPF_K */ {
- /* reg_set_min_max() can mangle the fake_reg. Make a copy
- * so that these are two different memory locations. The
- * src_reg is not used beyond here in context of K.
- */
- memcpy(&env->fake_reg[1], &env->fake_reg[0],
- sizeof(env->fake_reg[0]));
- err = reg_set_min_max(env,
- &other_branch_regs[insn->dst_reg],
- &env->fake_reg[0],
- dst_reg, &env->fake_reg[1],
- opcode, is_jmp32);
- }
+ err = regs_bounds_sanity_check_branches(env);
if (err)
return err;
+ copy_register_state(dst_reg, &env->false_reg1);
+ copy_register_state(src_reg, &env->false_reg2);
+ copy_register_state(&other_branch_regs[insn->dst_reg], &env->true_reg1);
+ if (BPF_SRC(insn->code) == BPF_X)
+ copy_register_state(&other_branch_regs[insn->src_reg], &env->true_reg2);
+
if (BPF_SRC(insn->code) == BPF_X &&
src_reg->type == SCALAR_VALUE && src_reg->id &&
!WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
@@ -17617,12 +16492,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
/* detect if R == 0 where R is returned from bpf_map_lookup_elem().
+ * Also does the same detection for a register whose value is
+ * known to be 0.
* NOTE: these optimizations below are related with pointer comparison
* which will never be JMP32.
*/
- if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
- insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
- type_may_be_null(dst_reg->type)) {
+ if (!is_jmp32 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
+ type_may_be_null(dst_reg->type) &&
+ ((BPF_SRC(insn->code) == BPF_K && insn->imm == 0) ||
+ (BPF_SRC(insn->code) == BPF_X && bpf_register_is_null(src_reg)))) {
/* Mark all identical registers in each branch as either
* safe or unknown depending R == 0 or R != 0 conditional.
*/
@@ -17655,10 +16533,6 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
verbose(env, "invalid BPF_LD_IMM insn\n");
return -EINVAL;
}
- if (insn->off != 0) {
- verbose(env, "BPF_LD_IMM64 uses reserved fields\n");
- return -EINVAL;
- }
err = check_reg_arg(env, insn->dst_reg, DST_OP);
if (err)
@@ -17698,8 +16572,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (insn->src_reg == BPF_PSEUDO_FUNC) {
struct bpf_prog_aux *aux = env->prog->aux;
- u32 subprogno = find_subprog(env,
- env->insn_idx + insn->imm + 1);
+ u32 subprogno = bpf_find_subprog(env,
+ env->insn_idx + insn->imm + 1);
if (!aux->func_info) {
verbose(env, "missing btf func_info\n");
@@ -17716,22 +16590,24 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
map = env->used_maps[aux->map_index];
- dst_reg->map_ptr = map;
if (insn->src_reg == BPF_PSEUDO_MAP_VALUE ||
insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) {
if (map->map_type == BPF_MAP_TYPE_ARENA) {
__mark_reg_unknown(env, dst_reg);
+ dst_reg->map_ptr = map;
return 0;
}
+ __mark_reg_known(dst_reg, aux->map_off);
dst_reg->type = PTR_TO_MAP_VALUE;
- dst_reg->off = aux->map_off;
+ dst_reg->map_ptr = map;
WARN_ON_ONCE(map->map_type != BPF_MAP_TYPE_INSN_ARRAY &&
map->max_entries != 1);
/* We want reg->id to be same (0) as map_value is not distinct */
} else if (insn->src_reg == BPF_PSEUDO_MAP_FD ||
insn->src_reg == BPF_PSEUDO_MAP_IDX) {
dst_reg->type = CONST_PTR_TO_MAP;
+ dst_reg->map_ptr = map;
} else {
verifier_bug(env, "unexpected src reg value for ldimm64");
return -EFAULT;
@@ -17784,13 +16660,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EFAULT;
}
- if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
- BPF_SIZE(insn->code) == BPF_DW ||
- (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
- verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n");
- return -EINVAL;
- }
-
/* check whether implicit source operand (register R6) is readable */
err = check_reg_arg(env, ctx_reg, SRC_OP);
if (err)
@@ -17823,7 +16692,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
/* reset caller saved regs to unreadable */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
- mark_reg_not_init(env, regs, caller_saved[i]);
+ bpf_mark_reg_not_init(env, &regs[caller_saved[i]]);
check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
}
@@ -17834,107 +16703,59 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
mark_reg_unknown(env, regs, BPF_REG_0);
/* ld_abs load up to 32-bit skb data. */
regs[BPF_REG_0].subreg_def = env->insn_idx + 1;
+ /*
+ * See bpf_gen_ld_abs() which emits a hidden BPF_EXIT with r0=0
+ * which must be explored by the verifier when in a subprog.
+ */
+ if (env->cur_state->curframe) {
+ struct bpf_verifier_state *branch;
+
+ mark_reg_scratched(env, BPF_REG_0);
+ branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false);
+ if (IS_ERR(branch))
+ return PTR_ERR(branch);
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ err = prepare_func_exit(env, &env->insn_idx);
+ if (err)
+ return err;
+ env->insn_idx--;
+ }
return 0;
}
-static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name)
+
+static bool return_retval_range(struct bpf_verifier_env *env, struct bpf_retval_range *range)
{
- const char *exit_ctx = "At program exit";
- struct tnum enforce_attach_type_range = tnum_unknown;
- const struct bpf_prog *prog = env->prog;
- struct bpf_reg_state *reg = reg_state(env, regno);
- struct bpf_retval_range range = retval_range(0, 1);
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
- int err;
- struct bpf_func_state *frame = env->cur_state->frame[0];
- const bool is_subprog = frame->subprogno;
- bool return_32bit = false;
- const struct btf_type *reg_type, *ret_type = NULL;
- /* LSM and struct_ops func-ptr's return type could be "void" */
- if (!is_subprog || frame->in_exception_callback_fn) {
- switch (prog_type) {
- case BPF_PROG_TYPE_LSM:
- if (prog->expected_attach_type == BPF_LSM_CGROUP)
- /* See below, can be 0 or 0-1 depending on hook. */
- break;
- if (!prog->aux->attach_func_proto->type)
- return 0;
- break;
- case BPF_PROG_TYPE_STRUCT_OPS:
- if (!prog->aux->attach_func_proto->type)
- return 0;
+ /* Default return value range. */
+ *range = retval_range(0, 1);
- if (frame->in_exception_callback_fn)
- break;
-
- /* Allow a struct_ops program to return a referenced kptr if it
- * matches the operator's return type and is in its unmodified
- * form. A scalar zero (i.e., a null pointer) is also allowed.
- */
- reg_type = reg->btf ? btf_type_by_id(reg->btf, reg->btf_id) : NULL;
- ret_type = btf_type_resolve_ptr(prog->aux->attach_btf,
- prog->aux->attach_func_proto->type,
- NULL);
- if (ret_type && ret_type == reg_type && reg->ref_obj_id)
- return __check_ptr_off_reg(env, reg, regno, false);
+ switch (prog_type) {
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ switch (env->prog->expected_attach_type) {
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
+ *range = retval_range(1, 1);
+ break;
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
+ *range = retval_range(0, 3);
break;
default:
break;
}
- }
-
- /* eBPF calling convention is such that R0 is used
- * to return the value from eBPF program.
- * Make sure that it's readable at this time
- * of bpf_exit, which means that program wrote
- * something into it earlier
- */
- err = check_reg_arg(env, regno, SRC_OP);
- if (err)
- return err;
-
- if (is_pointer_value(env, regno)) {
- verbose(env, "R%d leaks addr as return value\n", regno);
- return -EACCES;
- }
-
- if (frame->in_async_callback_fn) {
- exit_ctx = "At async callback return";
- range = frame->callback_ret_range;
- goto enforce_retval;
- }
-
- if (is_subprog && !frame->in_exception_callback_fn) {
- if (reg->type != SCALAR_VALUE) {
- verbose(env, "At subprogram exit the register R%d is not a scalar value (%s)\n",
- regno, reg_type_str(env, reg->type));
- return -EINVAL;
- }
- return 0;
- }
-
- switch (prog_type) {
- case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
- if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
- env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
- env->prog->expected_attach_type == BPF_CGROUP_UNIX_RECVMSG ||
- env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME ||
- env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME ||
- env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETPEERNAME ||
- env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
- env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME ||
- env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME)
- range = retval_range(1, 1);
- if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND ||
- env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND)
- range = retval_range(0, 3);
break;
case BPF_PROG_TYPE_CGROUP_SKB:
- if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
- range = retval_range(0, 3);
- enforce_attach_type_range = tnum_range(2, 3);
- }
+ if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS)
+ *range = retval_range(0, 3);
break;
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_SOCK_OPS:
@@ -17944,72 +16765,164 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
break;
case BPF_PROG_TYPE_RAW_TRACEPOINT:
if (!env->prog->aux->attach_btf_id)
- return 0;
- range = retval_range(0, 0);
+ return false;
+ *range = retval_range(0, 0);
break;
case BPF_PROG_TYPE_TRACING:
switch (env->prog->expected_attach_type) {
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
case BPF_TRACE_FSESSION:
- range = retval_range(0, 0);
+ *range = retval_range(0, 0);
break;
case BPF_TRACE_RAW_TP:
case BPF_MODIFY_RETURN:
- return 0;
+ return false;
case BPF_TRACE_ITER:
- break;
default:
- return -ENOTSUPP;
+ break;
}
break;
case BPF_PROG_TYPE_KPROBE:
switch (env->prog->expected_attach_type) {
case BPF_TRACE_KPROBE_SESSION:
case BPF_TRACE_UPROBE_SESSION:
- range = retval_range(0, 1);
break;
default:
- return 0;
+ return false;
}
break;
case BPF_PROG_TYPE_SK_LOOKUP:
- range = retval_range(SK_DROP, SK_PASS);
+ *range = retval_range(SK_DROP, SK_PASS);
break;
case BPF_PROG_TYPE_LSM:
if (env->prog->expected_attach_type != BPF_LSM_CGROUP) {
/* no range found, any return value is allowed */
- if (!get_func_retval_range(env->prog, &range))
- return 0;
+ if (!get_func_retval_range(env->prog, range))
+ return false;
/* no restricted range, any return value is allowed */
- if (range.minval == S32_MIN && range.maxval == S32_MAX)
- return 0;
- return_32bit = true;
+ if (range->minval == S32_MIN && range->maxval == S32_MAX)
+ return false;
+ range->return_32bit = true;
} else if (!env->prog->aux->attach_func_proto->type) {
/* Make sure programs that attach to void
* hooks don't try to modify return value.
*/
- range = retval_range(1, 1);
+ *range = retval_range(1, 1);
}
break;
case BPF_PROG_TYPE_NETFILTER:
- range = retval_range(NF_DROP, NF_ACCEPT);
+ *range = retval_range(NF_DROP, NF_ACCEPT);
break;
case BPF_PROG_TYPE_STRUCT_OPS:
- if (!ret_type)
- return 0;
- range = retval_range(0, 0);
+ *range = retval_range(0, 0);
break;
case BPF_PROG_TYPE_EXT:
/* freplace program can return anything as its return value
* depends on the to-be-replaced kernel func or bpf program.
*/
default:
+ return false;
+ }
+
+ /* Continue calculating. */
+
+ return true;
+}
+
+static bool program_returns_void(struct bpf_verifier_env *env)
+{
+ const struct bpf_prog *prog = env->prog;
+ enum bpf_prog_type prog_type = prog->type;
+
+ switch (prog_type) {
+ case BPF_PROG_TYPE_LSM:
+ /* See return_retval_range, for BPF_LSM_CGROUP can be 0 or 0-1 depending on hook. */
+ if (prog->expected_attach_type != BPF_LSM_CGROUP &&
+ !prog->aux->attach_func_proto->type)
+ return true;
+ break;
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ if (!prog->aux->attach_func_proto->type)
+ return true;
+ break;
+ case BPF_PROG_TYPE_EXT:
+ /*
+ * If the actual program is an extension, let it
+ * return void - attaching will succeed only if the
+ * program being replaced also returns void, and since
+ * it has passed verification its actual type doesn't matter.
+ */
+ if (subprog_returns_void(env, 0))
+ return true;
+ break;
+ default:
+ break;
+ }
+ return false;
+}
+
+static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name)
+{
+ const char *exit_ctx = "At program exit";
+ struct tnum enforce_attach_type_range = tnum_unknown;
+ const struct bpf_prog *prog = env->prog;
+ struct bpf_reg_state *reg = reg_state(env, regno);
+ struct bpf_retval_range range = retval_range(0, 1);
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+ struct bpf_func_state *frame = env->cur_state->frame[0];
+ const struct btf_type *reg_type, *ret_type = NULL;
+ int err;
+
+ /* LSM and struct_ops func-ptr's return type could be "void" */
+ if (!frame->in_async_callback_fn && program_returns_void(env))
return 0;
+
+ if (prog_type == BPF_PROG_TYPE_STRUCT_OPS) {
+ /* Allow a struct_ops program to return a referenced kptr if it
+ * matches the operator's return type and is in its unmodified
+ * form. A scalar zero (i.e., a null pointer) is also allowed.
+ */
+ reg_type = reg->btf ? btf_type_by_id(reg->btf, reg->btf_id) : NULL;
+ ret_type = btf_type_resolve_ptr(prog->aux->attach_btf,
+ prog->aux->attach_func_proto->type,
+ NULL);
+ if (ret_type && ret_type == reg_type && reg->ref_obj_id)
+ return __check_ptr_off_reg(env, reg, regno, false);
}
+ /* eBPF calling convention is such that R0 is used
+ * to return the value from eBPF program.
+ * Make sure that it's readable at this time
+ * of bpf_exit, which means that program wrote
+ * something into it earlier
+ */
+ err = check_reg_arg(env, regno, SRC_OP);
+ if (err)
+ return err;
+
+ if (is_pointer_value(env, regno)) {
+ verbose(env, "R%d leaks addr as return value\n", regno);
+ return -EACCES;
+ }
+
+ if (frame->in_async_callback_fn) {
+ exit_ctx = "At async callback return";
+ range = frame->callback_ret_range;
+ goto enforce_retval;
+ }
+
+ if (prog_type == BPF_PROG_TYPE_STRUCT_OPS && !ret_type)
+ return 0;
+
+ if (prog_type == BPF_PROG_TYPE_CGROUP_SKB && (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS))
+ enforce_attach_type_range = tnum_range(2, 3);
+
+ if (!return_retval_range(env, &range))
+ return 0;
+
enforce_retval:
if (reg->type != SCALAR_VALUE) {
verbose(env, "%s the register R%d is not a known value (%s)\n",
@@ -18021,10 +16934,9 @@ enforce_retval:
if (err)
return err;
- if (!retval_range_within(range, reg, return_32bit)) {
+ if (!retval_range_within(range, reg)) {
verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name);
- if (!is_subprog &&
- prog->expected_attach_type == BPF_LSM_CGROUP &&
+ if (prog->expected_attach_type == BPF_LSM_CGROUP &&
prog_type == BPF_PROG_TYPE_LSM &&
!prog->aux->attach_func_proto->type)
verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
@@ -18037,189 +16949,31 @@ enforce_retval:
return 0;
}
-static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off)
-{
- struct bpf_subprog_info *subprog;
-
- subprog = bpf_find_containing_subprog(env, off);
- subprog->changes_pkt_data = true;
-}
-
-static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off)
-{
- struct bpf_subprog_info *subprog;
-
- subprog = bpf_find_containing_subprog(env, off);
- subprog->might_sleep = true;
-}
-
-/* 't' is an index of a call-site.
- * 'w' is a callee entry point.
- * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED.
- * Rely on DFS traversal order and absence of recursive calls to guarantee that
- * callee's change_pkt_data marks would be correct at that moment.
- */
-static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w)
-{
- struct bpf_subprog_info *caller, *callee;
-
- caller = bpf_find_containing_subprog(env, t);
- callee = bpf_find_containing_subprog(env, w);
- caller->changes_pkt_data |= callee->changes_pkt_data;
- caller->might_sleep |= callee->might_sleep;
-}
-
-/* non-recursive DFS pseudo code
- * 1 procedure DFS-iterative(G,v):
- * 2 label v as discovered
- * 3 let S be a stack
- * 4 S.push(v)
- * 5 while S is not empty
- * 6 t <- S.peek()
- * 7 if t is what we're looking for:
- * 8 return t
- * 9 for all edges e in G.adjacentEdges(t) do
- * 10 if edge e is already labelled
- * 11 continue with the next edge
- * 12 w <- G.adjacentVertex(t,e)
- * 13 if vertex w is not discovered and not explored
- * 14 label e as tree-edge
- * 15 label w as discovered
- * 16 S.push(w)
- * 17 continue at 5
- * 18 else if vertex w is discovered
- * 19 label e as back-edge
- * 20 else
- * 21 // vertex w is explored
- * 22 label e as forward- or cross-edge
- * 23 label t as explored
- * 24 S.pop()
- *
- * convention:
- * 0x10 - discovered
- * 0x11 - discovered and fall-through edge labelled
- * 0x12 - discovered and fall-through and branch edges labelled
- * 0x20 - explored
- */
-
-enum {
- DISCOVERED = 0x10,
- EXPLORED = 0x20,
- FALLTHROUGH = 1,
- BRANCH = 2,
-};
-
-static void mark_prune_point(struct bpf_verifier_env *env, int idx)
-{
- env->insn_aux_data[idx].prune_point = true;
-}
-
-static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx)
-{
- return env->insn_aux_data[insn_idx].prune_point;
-}
-
-static void mark_force_checkpoint(struct bpf_verifier_env *env, int idx)
-{
- env->insn_aux_data[idx].force_checkpoint = true;
-}
-
-static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx)
-{
- return env->insn_aux_data[insn_idx].force_checkpoint;
-}
-
-static void mark_calls_callback(struct bpf_verifier_env *env, int idx)
-{
- env->insn_aux_data[idx].calls_callback = true;
-}
-
-bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx)
-{
- return env->insn_aux_data[insn_idx].calls_callback;
-}
-
-enum {
- DONE_EXPLORING = 0,
- KEEP_EXPLORING = 1,
-};
-
-/* t, w, e - match pseudo-code above:
- * t - index of current instruction
- * w - next instruction
- * e - edge
- */
-static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
+static int check_global_subprog_return_code(struct bpf_verifier_env *env)
{
- int *insn_stack = env->cfg.insn_stack;
- int *insn_state = env->cfg.insn_state;
-
- if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
- return DONE_EXPLORING;
+ struct bpf_reg_state *reg = reg_state(env, BPF_REG_0);
+ struct bpf_func_state *cur_frame = cur_func(env);
+ int err;
- if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
- return DONE_EXPLORING;
+ if (subprog_returns_void(env, cur_frame->subprogno))
+ return 0;
- if (w < 0 || w >= env->prog->len) {
- verbose_linfo(env, t, "%d: ", t);
- verbose(env, "jump out of range from insn %d to %d\n", t, w);
- return -EINVAL;
- }
+ err = check_reg_arg(env, BPF_REG_0, SRC_OP);
+ if (err)
+ return err;
- if (e == BRANCH) {
- /* mark branch target for state pruning */
- mark_prune_point(env, w);
- mark_jmp_point(env, w);
+ if (is_pointer_value(env, BPF_REG_0)) {
+ verbose(env, "R%d leaks addr as return value\n", BPF_REG_0);
+ return -EACCES;
}
- if (insn_state[w] == 0) {
- /* tree-edge */
- insn_state[t] = DISCOVERED | e;
- insn_state[w] = DISCOVERED;
- if (env->cfg.cur_stack >= env->prog->len)
- return -E2BIG;
- insn_stack[env->cfg.cur_stack++] = w;
- return KEEP_EXPLORING;
- } else if ((insn_state[w] & 0xF0) == DISCOVERED) {
- if (env->bpf_capable)
- return DONE_EXPLORING;
- verbose_linfo(env, t, "%d: ", t);
- verbose_linfo(env, w, "%d: ", w);
- verbose(env, "back-edge from insn %d to %d\n", t, w);
+ if (reg->type != SCALAR_VALUE) {
+ verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
+ reg_type_str(env, reg->type));
return -EINVAL;
- } else if (insn_state[w] == EXPLORED) {
- /* forward- or cross-edge */
- insn_state[t] = DISCOVERED | e;
- } else {
- verifier_bug(env, "insn state internal bug");
- return -EFAULT;
}
- return DONE_EXPLORING;
-}
-static int visit_func_call_insn(int t, struct bpf_insn *insns,
- struct bpf_verifier_env *env,
- bool visit_callee)
-{
- int ret, insn_sz;
- int w;
-
- insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1;
- ret = push_insn(t, t + insn_sz, FALLTHROUGH, env);
- if (ret)
- return ret;
-
- mark_prune_point(env, t + insn_sz);
- /* when we exit from subprog, we need to record non-linear history */
- mark_jmp_point(env, t + insn_sz);
-
- if (visit_callee) {
- w = t + insns[t].imm + 1;
- mark_prune_point(env, t);
- merge_callee_effects(env, t, w);
- ret = push_insn(t, w, BRANCH, env);
- }
- return ret;
+ return 0;
}
/* Bitmask with 1s for all caller saved registers */
@@ -18229,7 +16983,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
* replacement patch is presumed to follow bpf_fastcall contract
* (see mark_fastcall_pattern_for_call() below).
*/
-static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm)
+bool bpf_verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm)
{
switch (imm) {
#ifdef CONFIG_X86_64
@@ -18245,17 +16999,11 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm)
}
}
-struct call_summary {
- u8 num_params;
- bool is_void;
- bool fastcall;
-};
-
/* If @call is a kfunc or helper call, fills @cs and returns true,
* otherwise returns false.
*/
-static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call,
- struct call_summary *cs)
+bool bpf_get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call,
+ struct bpf_call_summary *cs)
{
struct bpf_kfunc_call_arg_meta meta;
const struct bpf_func_proto *fn;
@@ -18263,11 +17011,11 @@ static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call
if (bpf_helper_call(call)) {
- if (get_helper_proto(env, call->imm, &fn) < 0)
+ if (bpf_get_helper_proto(env, call->imm, &fn) < 0)
/* error would be reported later */
return false;
cs->fastcall = fn->allow_fastcall &&
- (verifier_inlines_helper_call(env, call->imm) ||
+ (bpf_verifier_inlines_helper_call(env, call->imm) ||
bpf_jit_inlines_helper_call(call->imm));
cs->is_void = fn->ret_type == RET_VOID;
cs->num_params = 0;
@@ -18282,7 +17030,7 @@ static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call
if (bpf_pseudo_kfunc_call(call)) {
int err;
- err = fetch_kfunc_arg_meta(env, call->imm, call->off, &meta);
+ err = bpf_fetch_kfunc_arg_meta(env, call->imm, call->off, &meta);
if (err < 0)
/* error would be reported later */
return false;
@@ -18376,12 +17124,12 @@ static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env,
struct bpf_insn *insns = env->prog->insnsi, *stx, *ldx;
struct bpf_insn *call = &env->prog->insnsi[insn_idx];
u32 clobbered_regs_mask;
- struct call_summary cs;
+ struct bpf_call_summary cs;
u32 expected_regs_mask;
s16 off;
int i;
- if (!get_call_summary(env, call, &cs))
+ if (!bpf_get_call_summary(env, call, &cs))
return;
/* A bitmask specifying which caller saved registers are clobbered
@@ -18484,714 +17232,6 @@ static int mark_fastcall_patterns(struct bpf_verifier_env *env)
return 0;
}
-static struct bpf_iarray *iarray_realloc(struct bpf_iarray *old, size_t n_elem)
-{
- size_t new_size = sizeof(struct bpf_iarray) + n_elem * sizeof(old->items[0]);
- struct bpf_iarray *new;
-
- new = kvrealloc(old, new_size, GFP_KERNEL_ACCOUNT);
- if (!new) {
- /* this is what callers always want, so simplify the call site */
- kvfree(old);
- return NULL;
- }
-
- new->cnt = n_elem;
- return new;
-}
-
-static int copy_insn_array(struct bpf_map *map, u32 start, u32 end, u32 *items)
-{
- struct bpf_insn_array_value *value;
- u32 i;
-
- for (i = start; i <= end; i++) {
- value = map->ops->map_lookup_elem(map, &i);
- /*
- * map_lookup_elem of an array map will never return an error,
- * but not checking it makes some static analysers to worry
- */
- if (IS_ERR(value))
- return PTR_ERR(value);
- else if (!value)
- return -EINVAL;
- items[i - start] = value->xlated_off;
- }
- return 0;
-}
-
-static int cmp_ptr_to_u32(const void *a, const void *b)
-{
- return *(u32 *)a - *(u32 *)b;
-}
-
-static int sort_insn_array_uniq(u32 *items, int cnt)
-{
- int unique = 1;
- int i;
-
- sort(items, cnt, sizeof(items[0]), cmp_ptr_to_u32, NULL);
-
- for (i = 1; i < cnt; i++)
- if (items[i] != items[unique - 1])
- items[unique++] = items[i];
-
- return unique;
-}
-
-/*
- * sort_unique({map[start], ..., map[end]}) into off
- */
-static int copy_insn_array_uniq(struct bpf_map *map, u32 start, u32 end, u32 *off)
-{
- u32 n = end - start + 1;
- int err;
-
- err = copy_insn_array(map, start, end, off);
- if (err)
- return err;
-
- return sort_insn_array_uniq(off, n);
-}
-
-/*
- * Copy all unique offsets from the map
- */
-static struct bpf_iarray *jt_from_map(struct bpf_map *map)
-{
- struct bpf_iarray *jt;
- int err;
- int n;
-
- jt = iarray_realloc(NULL, map->max_entries);
- if (!jt)
- return ERR_PTR(-ENOMEM);
-
- n = copy_insn_array_uniq(map, 0, map->max_entries - 1, jt->items);
- if (n < 0) {
- err = n;
- goto err_free;
- }
- if (n == 0) {
- err = -EINVAL;
- goto err_free;
- }
- jt->cnt = n;
- return jt;
-
-err_free:
- kvfree(jt);
- return ERR_PTR(err);
-}
-
-/*
- * Find and collect all maps which fit in the subprog. Return the result as one
- * combined jump table in jt->items (allocated with kvcalloc)
- */
-static struct bpf_iarray *jt_from_subprog(struct bpf_verifier_env *env,
- int subprog_start, int subprog_end)
-{
- struct bpf_iarray *jt = NULL;
- struct bpf_map *map;
- struct bpf_iarray *jt_cur;
- int i;
-
- for (i = 0; i < env->insn_array_map_cnt; i++) {
- /*
- * TODO (when needed): collect only jump tables, not static keys
- * or maps for indirect calls
- */
- map = env->insn_array_maps[i];
-
- jt_cur = jt_from_map(map);
- if (IS_ERR(jt_cur)) {
- kvfree(jt);
- return jt_cur;
- }
-
- /*
- * This is enough to check one element. The full table is
- * checked to fit inside the subprog later in create_jt()
- */
- if (jt_cur->items[0] >= subprog_start && jt_cur->items[0] < subprog_end) {
- u32 old_cnt = jt ? jt->cnt : 0;
- jt = iarray_realloc(jt, old_cnt + jt_cur->cnt);
- if (!jt) {
- kvfree(jt_cur);
- return ERR_PTR(-ENOMEM);
- }
- memcpy(jt->items + old_cnt, jt_cur->items, jt_cur->cnt << 2);
- }
-
- kvfree(jt_cur);
- }
-
- if (!jt) {
- verbose(env, "no jump tables found for subprog starting at %u\n", subprog_start);
- return ERR_PTR(-EINVAL);
- }
-
- jt->cnt = sort_insn_array_uniq(jt->items, jt->cnt);
- return jt;
-}
-
-static struct bpf_iarray *
-create_jt(int t, struct bpf_verifier_env *env)
-{
- static struct bpf_subprog_info *subprog;
- int subprog_start, subprog_end;
- struct bpf_iarray *jt;
- int i;
-
- subprog = bpf_find_containing_subprog(env, t);
- subprog_start = subprog->start;
- subprog_end = (subprog + 1)->start;
- jt = jt_from_subprog(env, subprog_start, subprog_end);
- if (IS_ERR(jt))
- return jt;
-
- /* Check that the every element of the jump table fits within the given subprogram */
- for (i = 0; i < jt->cnt; i++) {
- if (jt->items[i] < subprog_start || jt->items[i] >= subprog_end) {
- verbose(env, "jump table for insn %d points outside of the subprog [%u,%u]\n",
- t, subprog_start, subprog_end);
- kvfree(jt);
- return ERR_PTR(-EINVAL);
- }
- }
-
- return jt;
-}
-
-/* "conditional jump with N edges" */
-static int visit_gotox_insn(int t, struct bpf_verifier_env *env)
-{
- int *insn_stack = env->cfg.insn_stack;
- int *insn_state = env->cfg.insn_state;
- bool keep_exploring = false;
- struct bpf_iarray *jt;
- int i, w;
-
- jt = env->insn_aux_data[t].jt;
- if (!jt) {
- jt = create_jt(t, env);
- if (IS_ERR(jt))
- return PTR_ERR(jt);
-
- env->insn_aux_data[t].jt = jt;
- }
-
- mark_prune_point(env, t);
- for (i = 0; i < jt->cnt; i++) {
- w = jt->items[i];
- if (w < 0 || w >= env->prog->len) {
- verbose(env, "indirect jump out of range from insn %d to %d\n", t, w);
- return -EINVAL;
- }
-
- mark_jmp_point(env, w);
-
- /* EXPLORED || DISCOVERED */
- if (insn_state[w])
- continue;
-
- if (env->cfg.cur_stack >= env->prog->len)
- return -E2BIG;
-
- insn_stack[env->cfg.cur_stack++] = w;
- insn_state[w] |= DISCOVERED;
- keep_exploring = true;
- }
-
- return keep_exploring ? KEEP_EXPLORING : DONE_EXPLORING;
-}
-
-static int visit_tailcall_insn(struct bpf_verifier_env *env, int t)
-{
- static struct bpf_subprog_info *subprog;
- struct bpf_iarray *jt;
-
- if (env->insn_aux_data[t].jt)
- return 0;
-
- jt = iarray_realloc(NULL, 2);
- if (!jt)
- return -ENOMEM;
-
- subprog = bpf_find_containing_subprog(env, t);
- jt->items[0] = t + 1;
- jt->items[1] = subprog->exit_idx;
- env->insn_aux_data[t].jt = jt;
- return 0;
-}
-
-/* Visits the instruction at index t and returns one of the following:
- * < 0 - an error occurred
- * DONE_EXPLORING - the instruction was fully explored
- * KEEP_EXPLORING - there is still work to be done before it is fully explored
- */
-static int visit_insn(int t, struct bpf_verifier_env *env)
-{
- struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t];
- int ret, off, insn_sz;
-
- if (bpf_pseudo_func(insn))
- return visit_func_call_insn(t, insns, env, true);
-
- /* All non-branch instructions have a single fall-through edge. */
- if (BPF_CLASS(insn->code) != BPF_JMP &&
- BPF_CLASS(insn->code) != BPF_JMP32) {
- insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
- return push_insn(t, t + insn_sz, FALLTHROUGH, env);
- }
-
- switch (BPF_OP(insn->code)) {
- case BPF_EXIT:
- return DONE_EXPLORING;
-
- case BPF_CALL:
- if (is_async_callback_calling_insn(insn))
- /* Mark this call insn as a prune point to trigger
- * is_state_visited() check before call itself is
- * processed by __check_func_call(). Otherwise new
- * async state will be pushed for further exploration.
- */
- mark_prune_point(env, t);
- /* For functions that invoke callbacks it is not known how many times
- * callback would be called. Verifier models callback calling functions
- * by repeatedly visiting callback bodies and returning to origin call
- * instruction.
- * In order to stop such iteration verifier needs to identify when a
- * state identical some state from a previous iteration is reached.
- * Check below forces creation of checkpoint before callback calling
- * instruction to allow search for such identical states.
- */
- if (is_sync_callback_calling_insn(insn)) {
- mark_calls_callback(env, t);
- mark_force_checkpoint(env, t);
- mark_prune_point(env, t);
- mark_jmp_point(env, t);
- }
- if (bpf_helper_call(insn)) {
- const struct bpf_func_proto *fp;
-
- ret = get_helper_proto(env, insn->imm, &fp);
- /* If called in a non-sleepable context program will be
- * rejected anyway, so we should end up with precise
- * sleepable marks on subprogs, except for dead code
- * elimination.
- */
- if (ret == 0 && fp->might_sleep)
- mark_subprog_might_sleep(env, t);
- if (bpf_helper_changes_pkt_data(insn->imm))
- mark_subprog_changes_pkt_data(env, t);
- if (insn->imm == BPF_FUNC_tail_call)
- visit_tailcall_insn(env, t);
- } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
- struct bpf_kfunc_call_arg_meta meta;
-
- ret = fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta);
- if (ret == 0 && is_iter_next_kfunc(&meta)) {
- mark_prune_point(env, t);
- /* Checking and saving state checkpoints at iter_next() call
- * is crucial for fast convergence of open-coded iterator loop
- * logic, so we need to force it. If we don't do that,
- * is_state_visited() might skip saving a checkpoint, causing
- * unnecessarily long sequence of not checkpointed
- * instructions and jumps, leading to exhaustion of jump
- * history buffer, and potentially other undesired outcomes.
- * It is expected that with correct open-coded iterators
- * convergence will happen quickly, so we don't run a risk of
- * exhausting memory.
- */
- mark_force_checkpoint(env, t);
- }
- /* Same as helpers, if called in a non-sleepable context
- * program will be rejected anyway, so we should end up
- * with precise sleepable marks on subprogs, except for
- * dead code elimination.
- */
- if (ret == 0 && is_kfunc_sleepable(&meta))
- mark_subprog_might_sleep(env, t);
- if (ret == 0 && is_kfunc_pkt_changing(&meta))
- mark_subprog_changes_pkt_data(env, t);
- }
- return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
-
- case BPF_JA:
- if (BPF_SRC(insn->code) == BPF_X)
- return visit_gotox_insn(t, env);
-
- if (BPF_CLASS(insn->code) == BPF_JMP)
- off = insn->off;
- else
- off = insn->imm;
-
- /* unconditional jump with single edge */
- ret = push_insn(t, t + off + 1, FALLTHROUGH, env);
- if (ret)
- return ret;
-
- mark_prune_point(env, t + off + 1);
- mark_jmp_point(env, t + off + 1);
-
- return ret;
-
- default:
- /* conditional jump with two edges */
- mark_prune_point(env, t);
- if (is_may_goto_insn(insn))
- mark_force_checkpoint(env, t);
-
- ret = push_insn(t, t + 1, FALLTHROUGH, env);
- if (ret)
- return ret;
-
- return push_insn(t, t + insn->off + 1, BRANCH, env);
- }
-}
-
-/* non-recursive depth-first-search to detect loops in BPF program
- * loop == back-edge in directed graph
- */
-static int check_cfg(struct bpf_verifier_env *env)
-{
- int insn_cnt = env->prog->len;
- int *insn_stack, *insn_state;
- int ex_insn_beg, i, ret = 0;
-
- insn_state = env->cfg.insn_state = kvzalloc_objs(int, insn_cnt,
- GFP_KERNEL_ACCOUNT);
- if (!insn_state)
- return -ENOMEM;
-
- insn_stack = env->cfg.insn_stack = kvzalloc_objs(int, insn_cnt,
- GFP_KERNEL_ACCOUNT);
- if (!insn_stack) {
- kvfree(insn_state);
- return -ENOMEM;
- }
-
- ex_insn_beg = env->exception_callback_subprog
- ? env->subprog_info[env->exception_callback_subprog].start
- : 0;
-
- insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
- insn_stack[0] = 0; /* 0 is the first instruction */
- env->cfg.cur_stack = 1;
-
-walk_cfg:
- while (env->cfg.cur_stack > 0) {
- int t = insn_stack[env->cfg.cur_stack - 1];
-
- ret = visit_insn(t, env);
- switch (ret) {
- case DONE_EXPLORING:
- insn_state[t] = EXPLORED;
- env->cfg.cur_stack--;
- break;
- case KEEP_EXPLORING:
- break;
- default:
- if (ret > 0) {
- verifier_bug(env, "visit_insn internal bug");
- ret = -EFAULT;
- }
- goto err_free;
- }
- }
-
- if (env->cfg.cur_stack < 0) {
- verifier_bug(env, "pop stack internal bug");
- ret = -EFAULT;
- goto err_free;
- }
-
- if (ex_insn_beg && insn_state[ex_insn_beg] != EXPLORED) {
- insn_state[ex_insn_beg] = DISCOVERED;
- insn_stack[0] = ex_insn_beg;
- env->cfg.cur_stack = 1;
- goto walk_cfg;
- }
-
- for (i = 0; i < insn_cnt; i++) {
- struct bpf_insn *insn = &env->prog->insnsi[i];
-
- if (insn_state[i] != EXPLORED) {
- verbose(env, "unreachable insn %d\n", i);
- ret = -EINVAL;
- goto err_free;
- }
- if (bpf_is_ldimm64(insn)) {
- if (insn_state[i + 1] != 0) {
- verbose(env, "jump into the middle of ldimm64 insn %d\n", i);
- ret = -EINVAL;
- goto err_free;
- }
- i++; /* skip second half of ldimm64 */
- }
- }
- ret = 0; /* cfg looks good */
- env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data;
- env->prog->aux->might_sleep = env->subprog_info[0].might_sleep;
-
-err_free:
- kvfree(insn_state);
- kvfree(insn_stack);
- env->cfg.insn_state = env->cfg.insn_stack = NULL;
- return ret;
-}
-
-/*
- * For each subprogram 'i' fill array env->cfg.insn_subprogram sub-range
- * [env->subprog_info[i].postorder_start, env->subprog_info[i+1].postorder_start)
- * with indices of 'i' instructions in postorder.
- */
-static int compute_postorder(struct bpf_verifier_env *env)
-{
- u32 cur_postorder, i, top, stack_sz, s;
- int *stack = NULL, *postorder = NULL, *state = NULL;
- struct bpf_iarray *succ;
-
- postorder = kvzalloc_objs(int, env->prog->len, GFP_KERNEL_ACCOUNT);
- state = kvzalloc_objs(int, env->prog->len, GFP_KERNEL_ACCOUNT);
- stack = kvzalloc_objs(int, env->prog->len, GFP_KERNEL_ACCOUNT);
- if (!postorder || !state || !stack) {
- kvfree(postorder);
- kvfree(state);
- kvfree(stack);
- return -ENOMEM;
- }
- cur_postorder = 0;
- for (i = 0; i < env->subprog_cnt; i++) {
- env->subprog_info[i].postorder_start = cur_postorder;
- stack[0] = env->subprog_info[i].start;
- stack_sz = 1;
- do {
- top = stack[stack_sz - 1];
- state[top] |= DISCOVERED;
- if (state[top] & EXPLORED) {
- postorder[cur_postorder++] = top;
- stack_sz--;
- continue;
- }
- succ = bpf_insn_successors(env, top);
- for (s = 0; s < succ->cnt; ++s) {
- if (!state[succ->items[s]]) {
- stack[stack_sz++] = succ->items[s];
- state[succ->items[s]] |= DISCOVERED;
- }
- }
- state[top] |= EXPLORED;
- } while (stack_sz);
- }
- env->subprog_info[i].postorder_start = cur_postorder;
- env->cfg.insn_postorder = postorder;
- env->cfg.cur_postorder = cur_postorder;
- kvfree(stack);
- kvfree(state);
- return 0;
-}
-
-static int check_abnormal_return(struct bpf_verifier_env *env)
-{
- int i;
-
- for (i = 1; i < env->subprog_cnt; i++) {
- if (env->subprog_info[i].has_ld_abs) {
- verbose(env, "LD_ABS is not allowed in subprogs without BTF\n");
- return -EINVAL;
- }
- if (env->subprog_info[i].has_tail_call) {
- verbose(env, "tail_call is not allowed in subprogs without BTF\n");
- return -EINVAL;
- }
- }
- return 0;
-}
-
-/* The minimum supported BTF func info size */
-#define MIN_BPF_FUNCINFO_SIZE 8
-#define MAX_FUNCINFO_REC_SIZE 252
-
-static int check_btf_func_early(struct bpf_verifier_env *env,
- const union bpf_attr *attr,
- bpfptr_t uattr)
-{
- u32 krec_size = sizeof(struct bpf_func_info);
- const struct btf_type *type, *func_proto;
- u32 i, nfuncs, urec_size, min_size;
- struct bpf_func_info *krecord;
- struct bpf_prog *prog;
- const struct btf *btf;
- u32 prev_offset = 0;
- bpfptr_t urecord;
- int ret = -ENOMEM;
-
- nfuncs = attr->func_info_cnt;
- if (!nfuncs) {
- if (check_abnormal_return(env))
- return -EINVAL;
- return 0;
- }
-
- urec_size = attr->func_info_rec_size;
- if (urec_size < MIN_BPF_FUNCINFO_SIZE ||
- urec_size > MAX_FUNCINFO_REC_SIZE ||
- urec_size % sizeof(u32)) {
- verbose(env, "invalid func info rec size %u\n", urec_size);
- return -EINVAL;
- }
-
- prog = env->prog;
- btf = prog->aux->btf;
-
- urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
- min_size = min_t(u32, krec_size, urec_size);
-
- krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
- if (!krecord)
- return -ENOMEM;
-
- for (i = 0; i < nfuncs; i++) {
- ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
- if (ret) {
- if (ret == -E2BIG) {
- verbose(env, "nonzero tailing record in func info");
- /* set the size kernel expects so loader can zero
- * out the rest of the record.
- */
- if (copy_to_bpfptr_offset(uattr,
- offsetof(union bpf_attr, func_info_rec_size),
- &min_size, sizeof(min_size)))
- ret = -EFAULT;
- }
- goto err_free;
- }
-
- if (copy_from_bpfptr(&krecord[i], urecord, min_size)) {
- ret = -EFAULT;
- goto err_free;
- }
-
- /* check insn_off */
- ret = -EINVAL;
- if (i == 0) {
- if (krecord[i].insn_off) {
- verbose(env,
- "nonzero insn_off %u for the first func info record",
- krecord[i].insn_off);
- goto err_free;
- }
- } else if (krecord[i].insn_off <= prev_offset) {
- verbose(env,
- "same or smaller insn offset (%u) than previous func info record (%u)",
- krecord[i].insn_off, prev_offset);
- goto err_free;
- }
-
- /* check type_id */
- type = btf_type_by_id(btf, krecord[i].type_id);
- if (!type || !btf_type_is_func(type)) {
- verbose(env, "invalid type id %d in func info",
- krecord[i].type_id);
- goto err_free;
- }
-
- func_proto = btf_type_by_id(btf, type->type);
- if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto)))
- /* btf_func_check() already verified it during BTF load */
- goto err_free;
-
- prev_offset = krecord[i].insn_off;
- bpfptr_add(&urecord, urec_size);
- }
-
- prog->aux->func_info = krecord;
- prog->aux->func_info_cnt = nfuncs;
- return 0;
-
-err_free:
- kvfree(krecord);
- return ret;
-}
-
-static int check_btf_func(struct bpf_verifier_env *env,
- const union bpf_attr *attr,
- bpfptr_t uattr)
-{
- const struct btf_type *type, *func_proto, *ret_type;
- u32 i, nfuncs, urec_size;
- struct bpf_func_info *krecord;
- struct bpf_func_info_aux *info_aux = NULL;
- struct bpf_prog *prog;
- const struct btf *btf;
- bpfptr_t urecord;
- bool scalar_return;
- int ret = -ENOMEM;
-
- nfuncs = attr->func_info_cnt;
- if (!nfuncs) {
- if (check_abnormal_return(env))
- return -EINVAL;
- return 0;
- }
- if (nfuncs != env->subprog_cnt) {
- verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
- return -EINVAL;
- }
-
- urec_size = attr->func_info_rec_size;
-
- prog = env->prog;
- btf = prog->aux->btf;
-
- urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
-
- krecord = prog->aux->func_info;
- info_aux = kzalloc_objs(*info_aux, nfuncs,
- GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
- if (!info_aux)
- return -ENOMEM;
-
- for (i = 0; i < nfuncs; i++) {
- /* check insn_off */
- ret = -EINVAL;
-
- if (env->subprog_info[i].start != krecord[i].insn_off) {
- verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
- goto err_free;
- }
-
- /* Already checked type_id */
- type = btf_type_by_id(btf, krecord[i].type_id);
- info_aux[i].linkage = BTF_INFO_VLEN(type->info);
- /* Already checked func_proto */
- func_proto = btf_type_by_id(btf, type->type);
-
- ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL);
- scalar_return =
- btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type);
- if (i && !scalar_return && env->subprog_info[i].has_ld_abs) {
- verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n");
- goto err_free;
- }
- if (i && !scalar_return && env->subprog_info[i].has_tail_call) {
- verbose(env, "tail_call is only allowed in functions that return 'int'.\n");
- goto err_free;
- }
-
- bpfptr_add(&urecord, urec_size);
- }
-
- prog->aux->func_info_aux = info_aux;
- return 0;
-
-err_free:
- kfree(info_aux);
- return ret;
-}
-
static void adjust_btf_func(struct bpf_verifier_env *env)
{
struct bpf_prog_aux *aux = env->prog->aux;
@@ -19205,414 +17245,6 @@ static void adjust_btf_func(struct bpf_verifier_env *env)
aux->func_info[i].insn_off = env->subprog_info[i].start;
}
-#define MIN_BPF_LINEINFO_SIZE offsetofend(struct bpf_line_info, line_col)
-#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE
-
-static int check_btf_line(struct bpf_verifier_env *env,
- const union bpf_attr *attr,
- bpfptr_t uattr)
-{
- u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0;
- struct bpf_subprog_info *sub;
- struct bpf_line_info *linfo;
- struct bpf_prog *prog;
- const struct btf *btf;
- bpfptr_t ulinfo;
- int err;
-
- nr_linfo = attr->line_info_cnt;
- if (!nr_linfo)
- return 0;
- if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info))
- return -EINVAL;
-
- rec_size = attr->line_info_rec_size;
- if (rec_size < MIN_BPF_LINEINFO_SIZE ||
- rec_size > MAX_LINEINFO_REC_SIZE ||
- rec_size & (sizeof(u32) - 1))
- return -EINVAL;
-
- /* Need to zero it in case the userspace may
- * pass in a smaller bpf_line_info object.
- */
- linfo = kvzalloc_objs(struct bpf_line_info, nr_linfo,
- GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
- if (!linfo)
- return -ENOMEM;
-
- prog = env->prog;
- btf = prog->aux->btf;
-
- s = 0;
- sub = env->subprog_info;
- ulinfo = make_bpfptr(attr->line_info, uattr.is_kernel);
- expected_size = sizeof(struct bpf_line_info);
- ncopy = min_t(u32, expected_size, rec_size);
- for (i = 0; i < nr_linfo; i++) {
- err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size);
- if (err) {
- if (err == -E2BIG) {
- verbose(env, "nonzero tailing record in line_info");
- if (copy_to_bpfptr_offset(uattr,
- offsetof(union bpf_attr, line_info_rec_size),
- &expected_size, sizeof(expected_size)))
- err = -EFAULT;
- }
- goto err_free;
- }
-
- if (copy_from_bpfptr(&linfo[i], ulinfo, ncopy)) {
- err = -EFAULT;
- goto err_free;
- }
-
- /*
- * Check insn_off to ensure
- * 1) strictly increasing AND
- * 2) bounded by prog->len
- *
- * The linfo[0].insn_off == 0 check logically falls into
- * the later "missing bpf_line_info for func..." case
- * because the first linfo[0].insn_off must be the
- * first sub also and the first sub must have
- * subprog_info[0].start == 0.
- */
- if ((i && linfo[i].insn_off <= prev_offset) ||
- linfo[i].insn_off >= prog->len) {
- verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n",
- i, linfo[i].insn_off, prev_offset,
- prog->len);
- err = -EINVAL;
- goto err_free;
- }
-
- if (!prog->insnsi[linfo[i].insn_off].code) {
- verbose(env,
- "Invalid insn code at line_info[%u].insn_off\n",
- i);
- err = -EINVAL;
- goto err_free;
- }
-
- if (!btf_name_by_offset(btf, linfo[i].line_off) ||
- !btf_name_by_offset(btf, linfo[i].file_name_off)) {
- verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i);
- err = -EINVAL;
- goto err_free;
- }
-
- if (s != env->subprog_cnt) {
- if (linfo[i].insn_off == sub[s].start) {
- sub[s].linfo_idx = i;
- s++;
- } else if (sub[s].start < linfo[i].insn_off) {
- verbose(env, "missing bpf_line_info for func#%u\n", s);
- err = -EINVAL;
- goto err_free;
- }
- }
-
- prev_offset = linfo[i].insn_off;
- bpfptr_add(&ulinfo, rec_size);
- }
-
- if (s != env->subprog_cnt) {
- verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n",
- env->subprog_cnt - s, s);
- err = -EINVAL;
- goto err_free;
- }
-
- prog->aux->linfo = linfo;
- prog->aux->nr_linfo = nr_linfo;
-
- return 0;
-
-err_free:
- kvfree(linfo);
- return err;
-}
-
-#define MIN_CORE_RELO_SIZE sizeof(struct bpf_core_relo)
-#define MAX_CORE_RELO_SIZE MAX_FUNCINFO_REC_SIZE
-
-static int check_core_relo(struct bpf_verifier_env *env,
- const union bpf_attr *attr,
- bpfptr_t uattr)
-{
- u32 i, nr_core_relo, ncopy, expected_size, rec_size;
- struct bpf_core_relo core_relo = {};
- struct bpf_prog *prog = env->prog;
- const struct btf *btf = prog->aux->btf;
- struct bpf_core_ctx ctx = {
- .log = &env->log,
- .btf = btf,
- };
- bpfptr_t u_core_relo;
- int err;
-
- nr_core_relo = attr->core_relo_cnt;
- if (!nr_core_relo)
- return 0;
- if (nr_core_relo > INT_MAX / sizeof(struct bpf_core_relo))
- return -EINVAL;
-
- rec_size = attr->core_relo_rec_size;
- if (rec_size < MIN_CORE_RELO_SIZE ||
- rec_size > MAX_CORE_RELO_SIZE ||
- rec_size % sizeof(u32))
- return -EINVAL;
-
- u_core_relo = make_bpfptr(attr->core_relos, uattr.is_kernel);
- expected_size = sizeof(struct bpf_core_relo);
- ncopy = min_t(u32, expected_size, rec_size);
-
- /* Unlike func_info and line_info, copy and apply each CO-RE
- * relocation record one at a time.
- */
- for (i = 0; i < nr_core_relo; i++) {
- /* future proofing when sizeof(bpf_core_relo) changes */
- err = bpf_check_uarg_tail_zero(u_core_relo, expected_size, rec_size);
- if (err) {
- if (err == -E2BIG) {
- verbose(env, "nonzero tailing record in core_relo");
- if (copy_to_bpfptr_offset(uattr,
- offsetof(union bpf_attr, core_relo_rec_size),
- &expected_size, sizeof(expected_size)))
- err = -EFAULT;
- }
- break;
- }
-
- if (copy_from_bpfptr(&core_relo, u_core_relo, ncopy)) {
- err = -EFAULT;
- break;
- }
-
- if (core_relo.insn_off % 8 || core_relo.insn_off / 8 >= prog->len) {
- verbose(env, "Invalid core_relo[%u].insn_off:%u prog->len:%u\n",
- i, core_relo.insn_off, prog->len);
- err = -EINVAL;
- break;
- }
-
- err = bpf_core_apply(&ctx, &core_relo, i,
- &prog->insnsi[core_relo.insn_off / 8]);
- if (err)
- break;
- bpfptr_add(&u_core_relo, rec_size);
- }
- return err;
-}
-
-static int check_btf_info_early(struct bpf_verifier_env *env,
- const union bpf_attr *attr,
- bpfptr_t uattr)
-{
- struct btf *btf;
- int err;
-
- if (!attr->func_info_cnt && !attr->line_info_cnt) {
- if (check_abnormal_return(env))
- return -EINVAL;
- return 0;
- }
-
- btf = btf_get_by_fd(attr->prog_btf_fd);
- if (IS_ERR(btf))
- return PTR_ERR(btf);
- if (btf_is_kernel(btf)) {
- btf_put(btf);
- return -EACCES;
- }
- env->prog->aux->btf = btf;
-
- err = check_btf_func_early(env, attr, uattr);
- if (err)
- return err;
- return 0;
-}
-
-static int check_btf_info(struct bpf_verifier_env *env,
- const union bpf_attr *attr,
- bpfptr_t uattr)
-{
- int err;
-
- if (!attr->func_info_cnt && !attr->line_info_cnt) {
- if (check_abnormal_return(env))
- return -EINVAL;
- return 0;
- }
-
- err = check_btf_func(env, attr, uattr);
- if (err)
- return err;
-
- err = check_btf_line(env, attr, uattr);
- if (err)
- return err;
-
- err = check_core_relo(env, attr, uattr);
- if (err)
- return err;
-
- return 0;
-}
-
-/* check %cur's range satisfies %old's */
-static bool range_within(const struct bpf_reg_state *old,
- const struct bpf_reg_state *cur)
-{
- return old->umin_value <= cur->umin_value &&
- old->umax_value >= cur->umax_value &&
- old->smin_value <= cur->smin_value &&
- old->smax_value >= cur->smax_value &&
- old->u32_min_value <= cur->u32_min_value &&
- old->u32_max_value >= cur->u32_max_value &&
- old->s32_min_value <= cur->s32_min_value &&
- old->s32_max_value >= cur->s32_max_value;
-}
-
-/* If in the old state two registers had the same id, then they need to have
- * the same id in the new state as well. But that id could be different from
- * the old state, so we need to track the mapping from old to new ids.
- * Once we have seen that, say, a reg with old id 5 had new id 9, any subsequent
- * regs with old id 5 must also have new id 9 for the new state to be safe. But
- * regs with a different old id could still have new id 9, we don't care about
- * that.
- * So we look through our idmap to see if this old id has been seen before. If
- * so, we require the new id to match; otherwise, we add the id pair to the map.
- */
-static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
-{
- struct bpf_id_pair *map = idmap->map;
- unsigned int i;
-
- /* either both IDs should be set or both should be zero */
- if (!!old_id != !!cur_id)
- return false;
-
- if (old_id == 0) /* cur_id == 0 as well */
- return true;
-
- for (i = 0; i < idmap->cnt; i++) {
- if (map[i].old == old_id)
- return map[i].cur == cur_id;
- if (map[i].cur == cur_id)
- return false;
- }
-
- /* Reached the end of known mappings; haven't seen this id before */
- if (idmap->cnt < BPF_ID_MAP_SIZE) {
- map[idmap->cnt].old = old_id;
- map[idmap->cnt].cur = cur_id;
- idmap->cnt++;
- return true;
- }
-
- /* We ran out of idmap slots, which should be impossible */
- WARN_ON_ONCE(1);
- return false;
-}
-
-/*
- * Compare scalar register IDs for state equivalence.
- *
- * When old_id == 0, the old register is independent - not linked to any
- * other register. Any linking in the current state only adds constraints,
- * making it more restrictive. Since the old state didn't rely on any ID
- * relationships for this register, it's always safe to accept cur regardless
- * of its ID. Hence, return true immediately.
- *
- * When old_id != 0 but cur_id == 0, we need to ensure that different
- * independent registers in cur don't incorrectly satisfy the ID matching
- * requirements of linked registers in old.
- *
- * Example: if old has r6.id=X and r7.id=X (linked), but cur has r6.id=0
- * and r7.id=0 (both independent), without temp IDs both would map old_id=X
- * to cur_id=0 and pass. With temp IDs: r6 maps X->temp1, r7 tries to map
- * X->temp2, but X is already mapped to temp1, so the check fails correctly.
- */
-static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
-{
- if (!old_id)
- return true;
-
- cur_id = cur_id ? cur_id : ++idmap->tmp_id_gen;
-
- return check_ids(old_id, cur_id, idmap);
-}
-
-static void clean_func_state(struct bpf_verifier_env *env,
- struct bpf_func_state *st,
- u32 ip)
-{
- u16 live_regs = env->insn_aux_data[ip].live_regs_before;
- int i, j;
-
- for (i = 0; i < BPF_REG_FP; i++) {
- /* liveness must not touch this register anymore */
- if (!(live_regs & BIT(i)))
- /* since the register is unused, clear its state
- * to make further comparison simpler
- */
- __mark_reg_not_init(env, &st->regs[i]);
- }
-
- for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) {
- if (!bpf_stack_slot_alive(env, st->frameno, i)) {
- __mark_reg_not_init(env, &st->stack[i].spilled_ptr);
- for (j = 0; j < BPF_REG_SIZE; j++)
- st->stack[i].slot_type[j] = STACK_INVALID;
- }
- }
-}
-
-static void clean_verifier_state(struct bpf_verifier_env *env,
- struct bpf_verifier_state *st)
-{
- int i, ip;
-
- bpf_live_stack_query_init(env, st);
- st->cleaned = true;
- for (i = 0; i <= st->curframe; i++) {
- ip = frame_insn_idx(st, i);
- clean_func_state(env, st->frame[i], ip);
- }
-}
-
-/* the parentage chains form a tree.
- * the verifier states are added to state lists at given insn and
- * pushed into state stack for future exploration.
- * when the verifier reaches bpf_exit insn some of the verifier states
- * stored in the state lists have their final liveness state already,
- * but a lot of states will get revised from liveness point of view when
- * the verifier explores other branches.
- * Example:
- * 1: *(u64)(r10 - 8) = 1
- * 2: if r1 == 100 goto pc+1
- * 3: *(u64)(r10 - 8) = 2
- * 4: r0 = *(u64)(r10 - 8)
- * 5: exit
- * when the verifier reaches exit insn the stack slot -8 in the state list of
- * insn 2 is not yet marked alive. Then the verifier pops the other_branch
- * of insn 2 and goes exploring further. After the insn 4 read, liveness
- * analysis would propagate read mark for -8 at insn 2.
- *
- * Since the verifier pushes the branch states as it sees them while exploring
- * the program the condition of walking the branch instruction for the second
- * time means that all states below this branch were already explored and
- * their final liveness marks are already propagated.
- * Hence when the verifier completes the search of state list in is_state_visited()
- * we can call this clean_live_states() function to clear dead the registers and stack
- * slots to simplify state merging.
- *
- * Important note here that walking the same branch instruction in the callee
- * doesn't meant that the states are DONE. The verifier has to compare
- * the callsites
- */
-
/* Find id in idset and increment its count, or add new entry */
static void idset_cnt_inc(struct bpf_idset *idset, u32 id)
{
@@ -19649,8 +17281,8 @@ static u32 idset_cnt_get(struct bpf_idset *idset, u32 id)
* A register with a non-zero id is called singular if no other register shares
* the same base id. Such registers can be treated as independent (id=0).
*/
-static void clear_singular_ids(struct bpf_verifier_env *env,
- struct bpf_verifier_state *st)
+void bpf_clear_singular_ids(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st)
{
struct bpf_idset *idset = &env->idset_scratch;
struct bpf_func_state *func;
@@ -19671,1064 +17303,11 @@ static void clear_singular_ids(struct bpf_verifier_env *env,
continue;
if (!reg->id)
continue;
- if (idset_cnt_get(idset, reg->id & ~BPF_ADD_CONST) == 1) {
- reg->id = 0;
- reg->off = 0;
- }
+ if (idset_cnt_get(idset, reg->id & ~BPF_ADD_CONST) == 1)
+ clear_scalar_id(reg);
}));
}
-static void clean_live_states(struct bpf_verifier_env *env, int insn,
- struct bpf_verifier_state *cur)
-{
- struct bpf_verifier_state_list *sl;
- struct list_head *pos, *head;
-
- head = explored_state(env, insn);
- list_for_each(pos, head) {
- sl = container_of(pos, struct bpf_verifier_state_list, node);
- if (sl->state.branches)
- continue;
- if (sl->state.insn_idx != insn ||
- !same_callsites(&sl->state, cur))
- continue;
- if (sl->state.cleaned)
- /* all regs in this state in all frames were already marked */
- continue;
- if (incomplete_read_marks(env, &sl->state))
- continue;
- clean_verifier_state(env, &sl->state);
- }
-}
-
-static bool regs_exact(const struct bpf_reg_state *rold,
- const struct bpf_reg_state *rcur,
- struct bpf_idmap *idmap)
-{
- return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
- check_ids(rold->id, rcur->id, idmap) &&
- check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
-}
-
-enum exact_level {
- NOT_EXACT,
- EXACT,
- RANGE_WITHIN
-};
-
-/* Returns true if (rold safe implies rcur safe) */
-static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
- struct bpf_reg_state *rcur, struct bpf_idmap *idmap,
- enum exact_level exact)
-{
- if (exact == EXACT)
- return regs_exact(rold, rcur, idmap);
-
- if (rold->type == NOT_INIT)
- /* explored state can't have used this */
- return true;
-
- /* Enforce that register types have to match exactly, including their
- * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general
- * rule.
- *
- * One can make a point that using a pointer register as unbounded
- * SCALAR would be technically acceptable, but this could lead to
- * pointer leaks because scalars are allowed to leak while pointers
- * are not. We could make this safe in special cases if root is
- * calling us, but it's probably not worth the hassle.
- *
- * Also, register types that are *not* MAYBE_NULL could technically be
- * safe to use as their MAYBE_NULL variants (e.g., PTR_TO_MAP_VALUE
- * is safe to be used as PTR_TO_MAP_VALUE_OR_NULL, provided both point
- * to the same map).
- * However, if the old MAYBE_NULL register then got NULL checked,
- * doing so could have affected others with the same id, and we can't
- * check for that because we lost the id when we converted to
- * a non-MAYBE_NULL variant.
- * So, as a general rule we don't allow mixing MAYBE_NULL and
- * non-MAYBE_NULL registers as well.
- */
- if (rold->type != rcur->type)
- return false;
-
- switch (base_type(rold->type)) {
- case SCALAR_VALUE:
- if (env->explore_alu_limits) {
- /* explore_alu_limits disables tnum_in() and range_within()
- * logic and requires everything to be strict
- */
- return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
- check_scalar_ids(rold->id, rcur->id, idmap);
- }
- if (!rold->precise && exact == NOT_EXACT)
- return true;
- /*
- * Linked register tracking uses rold->id to detect relationships.
- * When rold->id == 0, the register is independent and any linking
- * in rcur only adds constraints. When rold->id != 0, we must verify
- * id mapping and (for BPF_ADD_CONST) offset consistency.
- *
- * +------------------+-----------+------------------+---------------+
- * | | rold->id | rold + ADD_CONST | rold->id == 0 |
- * |------------------+-----------+------------------+---------------|
- * | rcur->id | range,ids | false | range |
- * | rcur + ADD_CONST | false | range,ids,off | range |
- * | rcur->id == 0 | range,ids | false | range |
- * +------------------+-----------+------------------+---------------+
- *
- * Why check_ids() for scalar registers?
- *
- * Consider the following BPF code:
- * 1: r6 = ... unbound scalar, ID=a ...
- * 2: r7 = ... unbound scalar, ID=b ...
- * 3: if (r6 > r7) goto +1
- * 4: r6 = r7
- * 5: if (r6 > X) goto ...
- * 6: ... memory operation using r7 ...
- *
- * First verification path is [1-6]:
- * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7;
- * - at (5) r6 would be marked <= X, sync_linked_regs() would also mark
- * r7 <= X, because r6 and r7 share same id.
- * Next verification path is [1-4, 6].
- *
- * Instruction (6) would be reached in two states:
- * I. r6{.id=b}, r7{.id=b} via path 1-6;
- * II. r6{.id=a}, r7{.id=b} via path 1-4, 6.
- *
- * Use check_ids() to distinguish these states.
- * ---
- * Also verify that new value satisfies old value range knowledge.
- */
-
- /* ADD_CONST mismatch: different linking semantics */
- if ((rold->id & BPF_ADD_CONST) && !(rcur->id & BPF_ADD_CONST))
- return false;
-
- if (rold->id && !(rold->id & BPF_ADD_CONST) && (rcur->id & BPF_ADD_CONST))
- return false;
-
- /* Both have offset linkage: offsets must match */
- if ((rold->id & BPF_ADD_CONST) && rold->off != rcur->off)
- return false;
-
- if (!check_scalar_ids(rold->id, rcur->id, idmap))
- return false;
-
- return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off);
- case PTR_TO_MAP_KEY:
- case PTR_TO_MAP_VALUE:
- case PTR_TO_MEM:
- case PTR_TO_BUF:
- case PTR_TO_TP_BUFFER:
- /* If the new min/max/var_off satisfy the old ones and
- * everything else matches, we are OK.
- */
- return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
- range_within(rold, rcur) &&
- tnum_in(rold->var_off, rcur->var_off) &&
- check_ids(rold->id, rcur->id, idmap) &&
- check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
- case PTR_TO_PACKET_META:
- case PTR_TO_PACKET:
- /* We must have at least as much range as the old ptr
- * did, so that any accesses which were safe before are
- * still safe. This is true even if old range < old off,
- * since someone could have accessed through (ptr - k), or
- * even done ptr -= k in a register, to get a safe access.
- */
- if (rold->range > rcur->range)
- return false;
- /* If the offsets don't match, we can't trust our alignment;
- * nor can we be sure that we won't fall out of range.
- */
- if (rold->off != rcur->off)
- return false;
- /* id relations must be preserved */
- if (!check_ids(rold->id, rcur->id, idmap))
- return false;
- /* new val must satisfy old val knowledge */
- return range_within(rold, rcur) &&
- tnum_in(rold->var_off, rcur->var_off);
- case PTR_TO_STACK:
- /* two stack pointers are equal only if they're pointing to
- * the same stack frame, since fp-8 in foo != fp-8 in bar
- */
- return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno;
- case PTR_TO_ARENA:
- return true;
- case PTR_TO_INSN:
- return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
- rold->off == rcur->off && range_within(rold, rcur) &&
- tnum_in(rold->var_off, rcur->var_off);
- default:
- return regs_exact(rold, rcur, idmap);
- }
-}
-
-static struct bpf_reg_state unbound_reg;
-
-static __init int unbound_reg_init(void)
-{
- __mark_reg_unknown_imprecise(&unbound_reg);
- return 0;
-}
-late_initcall(unbound_reg_init);
-
-static bool is_stack_all_misc(struct bpf_verifier_env *env,
- struct bpf_stack_state *stack)
-{
- u32 i;
-
- for (i = 0; i < ARRAY_SIZE(stack->slot_type); ++i) {
- if ((stack->slot_type[i] == STACK_MISC) ||
- (stack->slot_type[i] == STACK_INVALID && env->allow_uninit_stack))
- continue;
- return false;
- }
-
- return true;
-}
-
-static struct bpf_reg_state *scalar_reg_for_stack(struct bpf_verifier_env *env,
- struct bpf_stack_state *stack)
-{
- if (is_spilled_scalar_reg64(stack))
- return &stack->spilled_ptr;
-
- if (is_stack_all_misc(env, stack))
- return &unbound_reg;
-
- return NULL;
-}
-
-static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
- struct bpf_func_state *cur, struct bpf_idmap *idmap,
- enum exact_level exact)
-{
- int i, spi;
-
- /* walk slots of the explored stack and ignore any additional
- * slots in the current stack, since explored(safe) state
- * didn't use them
- */
- for (i = 0; i < old->allocated_stack; i++) {
- struct bpf_reg_state *old_reg, *cur_reg;
-
- spi = i / BPF_REG_SIZE;
-
- if (exact == EXACT &&
- (i >= cur->allocated_stack ||
- old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
- cur->stack[spi].slot_type[i % BPF_REG_SIZE]))
- return false;
-
- if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
- continue;
-
- if (env->allow_uninit_stack &&
- old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC)
- continue;
-
- /* explored stack has more populated slots than current stack
- * and these slots were used
- */
- if (i >= cur->allocated_stack)
- return false;
-
- /* 64-bit scalar spill vs all slots MISC and vice versa.
- * Load from all slots MISC produces unbound scalar.
- * Construct a fake register for such stack and call
- * regsafe() to ensure scalar ids are compared.
- */
- old_reg = scalar_reg_for_stack(env, &old->stack[spi]);
- cur_reg = scalar_reg_for_stack(env, &cur->stack[spi]);
- if (old_reg && cur_reg) {
- if (!regsafe(env, old_reg, cur_reg, idmap, exact))
- return false;
- i += BPF_REG_SIZE - 1;
- continue;
- }
-
- /* if old state was safe with misc data in the stack
- * it will be safe with zero-initialized stack.
- * The opposite is not true
- */
- if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC &&
- cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO)
- continue;
- if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
- cur->stack[spi].slot_type[i % BPF_REG_SIZE])
- /* Ex: old explored (safe) state has STACK_SPILL in
- * this stack slot, but current has STACK_MISC ->
- * this verifier states are not equivalent,
- * return false to continue verification of this path
- */
- return false;
- if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1)
- continue;
- /* Both old and cur are having same slot_type */
- switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) {
- case STACK_SPILL:
- /* when explored and current stack slot are both storing
- * spilled registers, check that stored pointers types
- * are the same as well.
- * Ex: explored safe path could have stored
- * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
- * but current path has stored:
- * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
- * such verifier states are not equivalent.
- * return false to continue verification of this path
- */
- if (!regsafe(env, &old->stack[spi].spilled_ptr,
- &cur->stack[spi].spilled_ptr, idmap, exact))
- return false;
- break;
- case STACK_DYNPTR:
- old_reg = &old->stack[spi].spilled_ptr;
- cur_reg = &cur->stack[spi].spilled_ptr;
- if (old_reg->dynptr.type != cur_reg->dynptr.type ||
- old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot ||
- !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
- return false;
- break;
- case STACK_ITER:
- old_reg = &old->stack[spi].spilled_ptr;
- cur_reg = &cur->stack[spi].spilled_ptr;
- /* iter.depth is not compared between states as it
- * doesn't matter for correctness and would otherwise
- * prevent convergence; we maintain it only to prevent
- * infinite loop check triggering, see
- * iter_active_depths_differ()
- */
- if (old_reg->iter.btf != cur_reg->iter.btf ||
- old_reg->iter.btf_id != cur_reg->iter.btf_id ||
- old_reg->iter.state != cur_reg->iter.state ||
- /* ignore {old_reg,cur_reg}->iter.depth, see above */
- !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
- return false;
- break;
- case STACK_IRQ_FLAG:
- old_reg = &old->stack[spi].spilled_ptr;
- cur_reg = &cur->stack[spi].spilled_ptr;
- if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) ||
- old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class)
- return false;
- break;
- case STACK_MISC:
- case STACK_ZERO:
- case STACK_INVALID:
- continue;
- /* Ensure that new unhandled slot types return false by default */
- default:
- return false;
- }
- }
- return true;
-}
-
-static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *cur,
- struct bpf_idmap *idmap)
-{
- int i;
-
- if (old->acquired_refs != cur->acquired_refs)
- return false;
-
- if (old->active_locks != cur->active_locks)
- return false;
-
- if (old->active_preempt_locks != cur->active_preempt_locks)
- return false;
-
- if (old->active_rcu_locks != cur->active_rcu_locks)
- return false;
-
- if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap))
- return false;
-
- if (!check_ids(old->active_lock_id, cur->active_lock_id, idmap) ||
- old->active_lock_ptr != cur->active_lock_ptr)
- return false;
-
- for (i = 0; i < old->acquired_refs; i++) {
- if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) ||
- old->refs[i].type != cur->refs[i].type)
- return false;
- switch (old->refs[i].type) {
- case REF_TYPE_PTR:
- case REF_TYPE_IRQ:
- break;
- case REF_TYPE_LOCK:
- case REF_TYPE_RES_LOCK:
- case REF_TYPE_RES_LOCK_IRQ:
- if (old->refs[i].ptr != cur->refs[i].ptr)
- return false;
- break;
- default:
- WARN_ONCE(1, "Unhandled enum type for reference state: %d\n", old->refs[i].type);
- return false;
- }
- }
-
- return true;
-}
-
-/* compare two verifier states
- *
- * all states stored in state_list are known to be valid, since
- * verifier reached 'bpf_exit' instruction through them
- *
- * this function is called when verifier exploring different branches of
- * execution popped from the state stack. If it sees an old state that has
- * more strict register state and more strict stack state then this execution
- * branch doesn't need to be explored further, since verifier already
- * concluded that more strict state leads to valid finish.
- *
- * Therefore two states are equivalent if register state is more conservative
- * and explored stack state is more conservative than the current one.
- * Example:
- * explored current
- * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC)
- * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC)
- *
- * In other words if current stack state (one being explored) has more
- * valid slots than old one that already passed validation, it means
- * the verifier can stop exploring and conclude that current state is valid too
- *
- * Similarly with registers. If explored state has register type as invalid
- * whereas register type in current state is meaningful, it means that
- * the current state will reach 'bpf_exit' instruction safely
- */
-static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
- struct bpf_func_state *cur, u32 insn_idx, enum exact_level exact)
-{
- u16 live_regs = env->insn_aux_data[insn_idx].live_regs_before;
- u16 i;
-
- if (old->callback_depth > cur->callback_depth)
- return false;
-
- for (i = 0; i < MAX_BPF_REG; i++)
- if (((1 << i) & live_regs) &&
- !regsafe(env, &old->regs[i], &cur->regs[i],
- &env->idmap_scratch, exact))
- return false;
-
- if (!stacksafe(env, old, cur, &env->idmap_scratch, exact))
- return false;
-
- return true;
-}
-
-static void reset_idmap_scratch(struct bpf_verifier_env *env)
-{
- struct bpf_idmap *idmap = &env->idmap_scratch;
-
- idmap->tmp_id_gen = env->id_gen;
- idmap->cnt = 0;
-}
-
-static bool states_equal(struct bpf_verifier_env *env,
- struct bpf_verifier_state *old,
- struct bpf_verifier_state *cur,
- enum exact_level exact)
-{
- u32 insn_idx;
- int i;
-
- if (old->curframe != cur->curframe)
- return false;
-
- reset_idmap_scratch(env);
-
- /* Verification state from speculative execution simulation
- * must never prune a non-speculative execution one.
- */
- if (old->speculative && !cur->speculative)
- return false;
-
- if (old->in_sleepable != cur->in_sleepable)
- return false;
-
- if (!refsafe(old, cur, &env->idmap_scratch))
- return false;
-
- /* for states to be equal callsites have to be the same
- * and all frame states need to be equivalent
- */
- for (i = 0; i <= old->curframe; i++) {
- insn_idx = frame_insn_idx(old, i);
- if (old->frame[i]->callsite != cur->frame[i]->callsite)
- return false;
- if (!func_states_equal(env, old->frame[i], cur->frame[i], insn_idx, exact))
- return false;
- }
- return true;
-}
-
-/* find precise scalars in the previous equivalent state and
- * propagate them into the current state
- */
-static int propagate_precision(struct bpf_verifier_env *env,
- const struct bpf_verifier_state *old,
- struct bpf_verifier_state *cur,
- bool *changed)
-{
- struct bpf_reg_state *state_reg;
- struct bpf_func_state *state;
- int i, err = 0, fr;
- bool first;
-
- for (fr = old->curframe; fr >= 0; fr--) {
- state = old->frame[fr];
- state_reg = state->regs;
- first = true;
- for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
- if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise)
- continue;
- if (env->log.level & BPF_LOG_LEVEL2) {
- if (first)
- verbose(env, "frame %d: propagating r%d", fr, i);
- else
- verbose(env, ",r%d", i);
- }
- bt_set_frame_reg(&env->bt, fr, i);
- first = false;
- }
-
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
- if (!is_spilled_reg(&state->stack[i]))
- continue;
- state_reg = &state->stack[i].spilled_ptr;
- if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise)
- continue;
- if (env->log.level & BPF_LOG_LEVEL2) {
- if (first)
- verbose(env, "frame %d: propagating fp%d",
- fr, (-i - 1) * BPF_REG_SIZE);
- else
- verbose(env, ",fp%d", (-i - 1) * BPF_REG_SIZE);
- }
- bt_set_frame_slot(&env->bt, fr, i);
- first = false;
- }
- if (!first && (env->log.level & BPF_LOG_LEVEL2))
- verbose(env, "\n");
- }
-
- err = __mark_chain_precision(env, cur, -1, changed);
- if (err < 0)
- return err;
-
- return 0;
-}
-
-#define MAX_BACKEDGE_ITERS 64
-
-/* Propagate read and precision marks from visit->backedges[*].state->equal_state
- * to corresponding parent states of visit->backedges[*].state until fixed point is reached,
- * then free visit->backedges.
- * After execution of this function incomplete_read_marks() will return false
- * for all states corresponding to @visit->callchain.
- */
-static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit)
-{
- struct bpf_scc_backedge *backedge;
- struct bpf_verifier_state *st;
- bool changed;
- int i, err;
-
- i = 0;
- do {
- if (i++ > MAX_BACKEDGE_ITERS) {
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "%s: too many iterations\n", __func__);
- for (backedge = visit->backedges; backedge; backedge = backedge->next)
- mark_all_scalars_precise(env, &backedge->state);
- break;
- }
- changed = false;
- for (backedge = visit->backedges; backedge; backedge = backedge->next) {
- st = &backedge->state;
- err = propagate_precision(env, st->equal_state, st, &changed);
- if (err)
- return err;
- }
- } while (changed);
-
- free_backedges(visit);
- return 0;
-}
-
-static bool states_maybe_looping(struct bpf_verifier_state *old,
- struct bpf_verifier_state *cur)
-{
- struct bpf_func_state *fold, *fcur;
- int i, fr = cur->curframe;
-
- if (old->curframe != fr)
- return false;
-
- fold = old->frame[fr];
- fcur = cur->frame[fr];
- for (i = 0; i < MAX_BPF_REG; i++)
- if (memcmp(&fold->regs[i], &fcur->regs[i],
- offsetof(struct bpf_reg_state, frameno)))
- return false;
- return true;
-}
-
-static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx)
-{
- return env->insn_aux_data[insn_idx].is_iter_next;
-}
-
-/* is_state_visited() handles iter_next() (see process_iter_next_call() for
- * terminology) calls specially: as opposed to bounded BPF loops, it *expects*
- * states to match, which otherwise would look like an infinite loop. So while
- * iter_next() calls are taken care of, we still need to be careful and
- * prevent erroneous and too eager declaration of "infinite loop", when
- * iterators are involved.
- *
- * Here's a situation in pseudo-BPF assembly form:
- *
- * 0: again: ; set up iter_next() call args
- * 1: r1 = &it ; <CHECKPOINT HERE>
- * 2: call bpf_iter_num_next ; this is iter_next() call
- * 3: if r0 == 0 goto done
- * 4: ... something useful here ...
- * 5: goto again ; another iteration
- * 6: done:
- * 7: r1 = &it
- * 8: call bpf_iter_num_destroy ; clean up iter state
- * 9: exit
- *
- * This is a typical loop. Let's assume that we have a prune point at 1:,
- * before we get to `call bpf_iter_num_next` (e.g., because of that `goto
- * again`, assuming other heuristics don't get in a way).
- *
- * When we first time come to 1:, let's say we have some state X. We proceed
- * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit.
- * Now we come back to validate that forked ACTIVE state. We proceed through
- * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we
- * are converging. But the problem is that we don't know that yet, as this
- * convergence has to happen at iter_next() call site only. So if nothing is
- * done, at 1: verifier will use bounded loop logic and declare infinite
- * looping (and would be *technically* correct, if not for iterator's
- * "eventual sticky NULL" contract, see process_iter_next_call()). But we
- * don't want that. So what we do in process_iter_next_call() when we go on
- * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's
- * a different iteration. So when we suspect an infinite loop, we additionally
- * check if any of the *ACTIVE* iterator states depths differ. If yes, we
- * pretend we are not looping and wait for next iter_next() call.
- *
- * This only applies to ACTIVE state. In DRAINED state we don't expect to
- * loop, because that would actually mean infinite loop, as DRAINED state is
- * "sticky", and so we'll keep returning into the same instruction with the
- * same state (at least in one of possible code paths).
- *
- * This approach allows to keep infinite loop heuristic even in the face of
- * active iterator. E.g., C snippet below is and will be detected as
- * infinitely looping:
- *
- * struct bpf_iter_num it;
- * int *p, x;
- *
- * bpf_iter_num_new(&it, 0, 10);
- * while ((p = bpf_iter_num_next(&t))) {
- * x = p;
- * while (x--) {} // <<-- infinite loop here
- * }
- *
- */
-static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur)
-{
- struct bpf_reg_state *slot, *cur_slot;
- struct bpf_func_state *state;
- int i, fr;
-
- for (fr = old->curframe; fr >= 0; fr--) {
- state = old->frame[fr];
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
- if (state->stack[i].slot_type[0] != STACK_ITER)
- continue;
-
- slot = &state->stack[i].spilled_ptr;
- if (slot->iter.state != BPF_ITER_STATE_ACTIVE)
- continue;
-
- cur_slot = &cur->frame[fr]->stack[i].spilled_ptr;
- if (cur_slot->iter.depth != slot->iter.depth)
- return true;
- }
- }
- return false;
-}
-
-static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
-{
- struct bpf_verifier_state_list *new_sl;
- struct bpf_verifier_state_list *sl;
- struct bpf_verifier_state *cur = env->cur_state, *new;
- bool force_new_state, add_new_state, loop;
- int n, err, states_cnt = 0;
- struct list_head *pos, *tmp, *head;
-
- force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) ||
- /* Avoid accumulating infinitely long jmp history */
- cur->jmp_history_cnt > 40;
-
- /* bpf progs typically have pruning point every 4 instructions
- * http://vger.kernel.org/bpfconf2019.html#session-1
- * Do not add new state for future pruning if the verifier hasn't seen
- * at least 2 jumps and at least 8 instructions.
- * This heuristics helps decrease 'total_states' and 'peak_states' metric.
- * In tests that amounts to up to 50% reduction into total verifier
- * memory consumption and 20% verifier time speedup.
- */
- add_new_state = force_new_state;
- if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
- env->insn_processed - env->prev_insn_processed >= 8)
- add_new_state = true;
-
- clean_live_states(env, insn_idx, cur);
-
- loop = false;
- head = explored_state(env, insn_idx);
- list_for_each_safe(pos, tmp, head) {
- sl = container_of(pos, struct bpf_verifier_state_list, node);
- states_cnt++;
- if (sl->state.insn_idx != insn_idx)
- continue;
-
- if (sl->state.branches) {
- struct bpf_func_state *frame = sl->state.frame[sl->state.curframe];
-
- if (frame->in_async_callback_fn &&
- frame->async_entry_cnt != cur->frame[cur->curframe]->async_entry_cnt) {
- /* Different async_entry_cnt means that the verifier is
- * processing another entry into async callback.
- * Seeing the same state is not an indication of infinite
- * loop or infinite recursion.
- * But finding the same state doesn't mean that it's safe
- * to stop processing the current state. The previous state
- * hasn't yet reached bpf_exit, since state.branches > 0.
- * Checking in_async_callback_fn alone is not enough either.
- * Since the verifier still needs to catch infinite loops
- * inside async callbacks.
- */
- goto skip_inf_loop_check;
- }
- /* BPF open-coded iterators loop detection is special.
- * states_maybe_looping() logic is too simplistic in detecting
- * states that *might* be equivalent, because it doesn't know
- * about ID remapping, so don't even perform it.
- * See process_iter_next_call() and iter_active_depths_differ()
- * for overview of the logic. When current and one of parent
- * states are detected as equivalent, it's a good thing: we prove
- * convergence and can stop simulating further iterations.
- * It's safe to assume that iterator loop will finish, taking into
- * account iter_next() contract of eventually returning
- * sticky NULL result.
- *
- * Note, that states have to be compared exactly in this case because
- * read and precision marks might not be finalized inside the loop.
- * E.g. as in the program below:
- *
- * 1. r7 = -16
- * 2. r6 = bpf_get_prandom_u32()
- * 3. while (bpf_iter_num_next(&fp[-8])) {
- * 4. if (r6 != 42) {
- * 5. r7 = -32
- * 6. r6 = bpf_get_prandom_u32()
- * 7. continue
- * 8. }
- * 9. r0 = r10
- * 10. r0 += r7
- * 11. r8 = *(u64 *)(r0 + 0)
- * 12. r6 = bpf_get_prandom_u32()
- * 13. }
- *
- * Here verifier would first visit path 1-3, create a checkpoint at 3
- * with r7=-16, continue to 4-7,3. Existing checkpoint at 3 does
- * not have read or precision mark for r7 yet, thus inexact states
- * comparison would discard current state with r7=-32
- * => unsafe memory access at 11 would not be caught.
- */
- if (is_iter_next_insn(env, insn_idx)) {
- if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
- struct bpf_func_state *cur_frame;
- struct bpf_reg_state *iter_state, *iter_reg;
- int spi;
-
- cur_frame = cur->frame[cur->curframe];
- /* btf_check_iter_kfuncs() enforces that
- * iter state pointer is always the first arg
- */
- iter_reg = &cur_frame->regs[BPF_REG_1];
- /* current state is valid due to states_equal(),
- * so we can assume valid iter and reg state,
- * no need for extra (re-)validations
- */
- spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
- iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
- if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) {
- loop = true;
- goto hit;
- }
- }
- goto skip_inf_loop_check;
- }
- if (is_may_goto_insn_at(env, insn_idx)) {
- if (sl->state.may_goto_depth != cur->may_goto_depth &&
- states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
- loop = true;
- goto hit;
- }
- }
- if (bpf_calls_callback(env, insn_idx)) {
- if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
- loop = true;
- goto hit;
- }
- goto skip_inf_loop_check;
- }
- /* attempt to detect infinite loop to avoid unnecessary doomed work */
- if (states_maybe_looping(&sl->state, cur) &&
- states_equal(env, &sl->state, cur, EXACT) &&
- !iter_active_depths_differ(&sl->state, cur) &&
- sl->state.may_goto_depth == cur->may_goto_depth &&
- sl->state.callback_unroll_depth == cur->callback_unroll_depth) {
- verbose_linfo(env, insn_idx, "; ");
- verbose(env, "infinite loop detected at insn %d\n", insn_idx);
- verbose(env, "cur state:");
- print_verifier_state(env, cur, cur->curframe, true);
- verbose(env, "old state:");
- print_verifier_state(env, &sl->state, cur->curframe, true);
- return -EINVAL;
- }
- /* if the verifier is processing a loop, avoid adding new state
- * too often, since different loop iterations have distinct
- * states and may not help future pruning.
- * This threshold shouldn't be too low to make sure that
- * a loop with large bound will be rejected quickly.
- * The most abusive loop will be:
- * r1 += 1
- * if r1 < 1000000 goto pc-2
- * 1M insn_procssed limit / 100 == 10k peak states.
- * This threshold shouldn't be too high either, since states
- * at the end of the loop are likely to be useful in pruning.
- */
-skip_inf_loop_check:
- if (!force_new_state &&
- env->jmps_processed - env->prev_jmps_processed < 20 &&
- env->insn_processed - env->prev_insn_processed < 100)
- add_new_state = false;
- goto miss;
- }
- /* See comments for mark_all_regs_read_and_precise() */
- loop = incomplete_read_marks(env, &sl->state);
- if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) {
-hit:
- sl->hit_cnt++;
-
- /* if previous state reached the exit with precision and
- * current state is equivalent to it (except precision marks)
- * the precision needs to be propagated back in
- * the current state.
- */
- err = 0;
- if (is_jmp_point(env, env->insn_idx))
- err = push_jmp_history(env, cur, 0, 0);
- err = err ? : propagate_precision(env, &sl->state, cur, NULL);
- if (err)
- return err;
- /* When processing iterator based loops above propagate_liveness and
- * propagate_precision calls are not sufficient to transfer all relevant
- * read and precision marks. E.g. consider the following case:
- *
- * .-> A --. Assume the states are visited in the order A, B, C.
- * | | | Assume that state B reaches a state equivalent to state A.
- * | v v At this point, state C is not processed yet, so state A
- * '-- B C has not received any read or precision marks from C.
- * Thus, marks propagated from A to B are incomplete.
- *
- * The verifier mitigates this by performing the following steps:
- *
- * - Prior to the main verification pass, strongly connected components
- * (SCCs) are computed over the program's control flow graph,
- * intraprocedurally.
- *
- * - During the main verification pass, `maybe_enter_scc()` checks
- * whether the current verifier state is entering an SCC. If so, an
- * instance of a `bpf_scc_visit` object is created, and the state
- * entering the SCC is recorded as the entry state.
- *
- * - This instance is associated not with the SCC itself, but with a
- * `bpf_scc_callchain`: a tuple consisting of the call sites leading to
- * the SCC and the SCC id. See `compute_scc_callchain()`.
- *
- * - When a verification path encounters a `states_equal(...,
- * RANGE_WITHIN)` condition, there exists a call chain describing the
- * current state and a corresponding `bpf_scc_visit` instance. A copy
- * of the current state is created and added to
- * `bpf_scc_visit->backedges`.
- *
- * - When a verification path terminates, `maybe_exit_scc()` is called
- * from `update_branch_counts()`. For states with `branches == 0`, it
- * checks whether the state is the entry state of any `bpf_scc_visit`
- * instance. If it is, this indicates that all paths originating from
- * this SCC visit have been explored. `propagate_backedges()` is then
- * called, which propagates read and precision marks through the
- * backedges until a fixed point is reached.
- * (In the earlier example, this would propagate marks from A to B,
- * from C to A, and then again from A to B.)
- *
- * A note on callchains
- * --------------------
- *
- * Consider the following example:
- *
- * void foo() { loop { ... SCC#1 ... } }
- * void main() {
- * A: foo();
- * B: ...
- * C: foo();
- * }
- *
- * Here, there are two distinct callchains leading to SCC#1:
- * - (A, SCC#1)
- * - (C, SCC#1)
- *
- * Each callchain identifies a separate `bpf_scc_visit` instance that
- * accumulates backedge states. The `propagate_{liveness,precision}()`
- * functions traverse the parent state of each backedge state, which
- * means these parent states must remain valid (i.e., not freed) while
- * the corresponding `bpf_scc_visit` instance exists.
- *
- * Associating `bpf_scc_visit` instances directly with SCCs instead of
- * callchains would break this invariant:
- * - States explored during `C: foo()` would contribute backedges to
- * SCC#1, but SCC#1 would only be exited once the exploration of
- * `A: foo()` completes.
- * - By that time, the states explored between `A: foo()` and `C: foo()`
- * (i.e., `B: ...`) may have already been freed, causing the parent
- * links for states from `C: foo()` to become invalid.
- */
- if (loop) {
- struct bpf_scc_backedge *backedge;
-
- backedge = kzalloc_obj(*backedge,
- GFP_KERNEL_ACCOUNT);
- if (!backedge)
- return -ENOMEM;
- err = copy_verifier_state(&backedge->state, cur);
- backedge->state.equal_state = &sl->state;
- backedge->state.insn_idx = insn_idx;
- err = err ?: add_scc_backedge(env, &sl->state, backedge);
- if (err) {
- free_verifier_state(&backedge->state, false);
- kfree(backedge);
- return err;
- }
- }
- return 1;
- }
-miss:
- /* when new state is not going to be added do not increase miss count.
- * Otherwise several loop iterations will remove the state
- * recorded earlier. The goal of these heuristics is to have
- * states from some iterations of the loop (some in the beginning
- * and some at the end) to help pruning.
- */
- if (add_new_state)
- sl->miss_cnt++;
- /* heuristic to determine whether this state is beneficial
- * to keep checking from state equivalence point of view.
- * Higher numbers increase max_states_per_insn and verification time,
- * but do not meaningfully decrease insn_processed.
- * 'n' controls how many times state could miss before eviction.
- * Use bigger 'n' for checkpoints because evicting checkpoint states
- * too early would hinder iterator convergence.
- */
- n = is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3;
- if (sl->miss_cnt > sl->hit_cnt * n + n) {
- /* the state is unlikely to be useful. Remove it to
- * speed up verification
- */
- sl->in_free_list = true;
- list_del(&sl->node);
- list_add(&sl->node, &env->free_list);
- env->free_list_size++;
- env->explored_states_size--;
- maybe_free_verifier_state(env, sl);
- }
- }
-
- if (env->max_states_per_insn < states_cnt)
- env->max_states_per_insn = states_cnt;
-
- if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
- return 0;
-
- if (!add_new_state)
- return 0;
-
- /* There were no equivalent states, remember the current one.
- * Technically the current state is not proven to be safe yet,
- * but it will either reach outer most bpf_exit (which means it's safe)
- * or it will be rejected. When there are no loops the verifier won't be
- * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx)
- * again on the way to bpf_exit.
- * When looping the sl->state.branches will be > 0 and this state
- * will not be considered for equivalence until branches == 0.
- */
- new_sl = kzalloc_obj(struct bpf_verifier_state_list, GFP_KERNEL_ACCOUNT);
- if (!new_sl)
- return -ENOMEM;
- env->total_states++;
- env->explored_states_size++;
- update_peak_states(env);
- env->prev_jmps_processed = env->jmps_processed;
- env->prev_insn_processed = env->insn_processed;
-
- /* forget precise markings we inherited, see __mark_chain_precision */
- if (env->bpf_capable)
- mark_all_scalars_imprecise(env, cur);
-
- clear_singular_ids(env, cur);
-
- /* add new state to the head of linked list */
- new = &new_sl->state;
- err = copy_verifier_state(new, cur);
- if (err) {
- free_verifier_state(new, false);
- kfree(new_sl);
- return err;
- }
- new->insn_idx = insn_idx;
- verifier_bug_if(new->branches != 1, env,
- "%s:branches_to_explore=%d insn %d",
- __func__, new->branches, insn_idx);
- err = maybe_enter_scc(env, new);
- if (err) {
- free_verifier_state(new, false);
- kfree(new_sl);
- return err;
- }
-
- cur->parent = new;
- cur->first_insn_idx = insn_idx;
- cur->dfs_depth = new->dfs_depth + 1;
- clear_jmp_history(cur);
- list_add(&new_sl->node, head);
- return 0;
-}
-
/* Return true if it's OK to have the same insn return a different type. */
static bool reg_type_mismatch_ok(enum bpf_reg_type type)
{
@@ -20829,13 +17408,16 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ
}
enum {
- PROCESS_BPF_EXIT = 1
+ PROCESS_BPF_EXIT = 1,
+ INSN_IDX_UPDATED = 2,
};
static int process_bpf_exit_full(struct bpf_verifier_env *env,
bool *do_print_state,
bool exception_exit)
{
+ struct bpf_func_state *cur_frame = cur_func(env);
+
/* We must do check_reference_leak here before
* prepare_func_exit to handle the case when
* state->curframe > 0, it may be a callback function,
@@ -20843,7 +17425,8 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env,
* state when it exits.
*/
int err = check_resource_leak(env, exception_exit,
- !env->cur_state->curframe,
+ exception_exit || !env->cur_state->curframe,
+ exception_exit ? "bpf_throw" :
"BPF_EXIT instruction in main prog");
if (err)
return err;
@@ -20866,10 +17449,24 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env,
if (err)
return err;
*do_print_state = true;
- return 0;
+ return INSN_IDX_UPDATED;
}
- err = check_return_code(env, BPF_REG_0, "R0");
+ /*
+ * Return from a regular global subprogram differs from return
+ * from the main program or async/exception callback.
+ * Main program exit implies return code restrictions
+ * that depend on program type.
+ * Exit from exception callback is equivalent to main program exit.
+ * Exit from async callback implies return code restrictions
+ * that depend on async scheduling mechanism.
+ */
+ if (cur_frame->subprogno &&
+ !cur_frame->in_async_callback_fn &&
+ !cur_frame->in_exception_callback_fn)
+ err = check_global_subprog_return_code(env);
+ else
+ err = check_return_code(env, BPF_REG_0, "R0");
if (err)
return err;
return PROCESS_BPF_EXIT;
@@ -20881,19 +17478,16 @@ static int indirect_jump_min_max_index(struct bpf_verifier_env *env,
u32 *pmin_index, u32 *pmax_index)
{
struct bpf_reg_state *reg = reg_state(env, regno);
- u64 min_index, max_index;
+ u64 min_index = reg->umin_value;
+ u64 max_index = reg->umax_value;
const u32 size = 8;
- if (check_add_overflow(reg->umin_value, reg->off, &min_index) ||
- (min_index > (u64) U32_MAX * size)) {
- verbose(env, "the sum of R%u umin_value %llu and off %u is too big\n",
- regno, reg->umin_value, reg->off);
+ if (min_index > (u64) U32_MAX * size) {
+ verbose(env, "the sum of R%u umin_value %llu is too big\n", regno, reg->umin_value);
return -ERANGE;
}
- if (check_add_overflow(reg->umax_value, reg->off, &max_index) ||
- (max_index > (u64) U32_MAX * size)) {
- verbose(env, "the sum of R%u umax_value %llu and off %u is too big\n",
- regno, reg->umax_value, reg->off);
+ if (max_index > (u64) U32_MAX * size) {
+ verbose(env, "the sum of R%u umax_value %llu is too big\n", regno, reg->umax_value);
return -ERANGE;
}
@@ -20943,13 +17537,13 @@ static int check_indirect_jump(struct bpf_verifier_env *env, struct bpf_insn *in
/* Ensure that the buffer is large enough */
if (!env->gotox_tmp_buf || env->gotox_tmp_buf->cnt < max_index - min_index + 1) {
- env->gotox_tmp_buf = iarray_realloc(env->gotox_tmp_buf,
- max_index - min_index + 1);
+ env->gotox_tmp_buf = bpf_iarray_realloc(env->gotox_tmp_buf,
+ max_index - min_index + 1);
if (!env->gotox_tmp_buf)
return -ENOMEM;
}
- n = copy_insn_array_uniq(map, min_index, max_index, env->gotox_tmp_buf->items);
+ n = bpf_copy_insn_array_uniq(map, min_index, max_index, env->gotox_tmp_buf->items);
if (n < 0)
return n;
if (n == 0) {
@@ -20959,13 +17553,15 @@ static int check_indirect_jump(struct bpf_verifier_env *env, struct bpf_insn *in
}
for (i = 0; i < n - 1; i++) {
+ mark_indirect_target(env, env->gotox_tmp_buf->items[i]);
other_branch = push_stack(env, env->gotox_tmp_buf->items[i],
env->insn_idx, env->cur_state->speculative);
if (IS_ERR(other_branch))
return PTR_ERR(other_branch);
}
env->insn_idx = env->gotox_tmp_buf->items[n-1];
- return 0;
+ mark_indirect_target(env, env->insn_idx);
+ return INSN_IDX_UPDATED;
}
static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state)
@@ -20974,81 +17570,48 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state)
struct bpf_insn *insn = &env->prog->insnsi[env->insn_idx];
u8 class = BPF_CLASS(insn->code);
- if (class == BPF_ALU || class == BPF_ALU64) {
- err = check_alu_op(env, insn);
- if (err)
- return err;
-
- } else if (class == BPF_LDX) {
- bool is_ldsx = BPF_MODE(insn->code) == BPF_MEMSX;
+ switch (class) {
+ case BPF_ALU:
+ case BPF_ALU64:
+ return check_alu_op(env, insn);
- /* Check for reserved fields is already done in
- * resolve_pseudo_ldimm64().
- */
- err = check_load_mem(env, insn, false, is_ldsx, true, "ldx");
- if (err)
- return err;
- } else if (class == BPF_STX) {
- if (BPF_MODE(insn->code) == BPF_ATOMIC) {
- err = check_atomic(env, insn);
- if (err)
- return err;
- env->insn_idx++;
- return 0;
- }
+ case BPF_LDX:
+ return check_load_mem(env, insn, false,
+ BPF_MODE(insn->code) == BPF_MEMSX,
+ true, "ldx");
- if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) {
- verbose(env, "BPF_STX uses reserved fields\n");
- return -EINVAL;
- }
+ case BPF_STX:
+ if (BPF_MODE(insn->code) == BPF_ATOMIC)
+ return check_atomic(env, insn);
+ return check_store_reg(env, insn, false);
- err = check_store_reg(env, insn, false);
- if (err)
- return err;
- } else if (class == BPF_ST) {
+ case BPF_ST: {
enum bpf_reg_type dst_reg_type;
- if (BPF_MODE(insn->code) != BPF_MEM ||
- insn->src_reg != BPF_REG_0) {
- verbose(env, "BPF_ST uses reserved fields\n");
- return -EINVAL;
- }
- /* check src operand */
err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
dst_reg_type = cur_regs(env)[insn->dst_reg].type;
- /* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, env->insn_idx, insn->dst_reg,
insn->off, BPF_SIZE(insn->code),
BPF_WRITE, -1, false, false);
if (err)
return err;
- err = save_aux_ptr_type(env, dst_reg_type, false);
- if (err)
- return err;
- } else if (class == BPF_JMP || class == BPF_JMP32) {
+ return save_aux_ptr_type(env, dst_reg_type, false);
+ }
+ case BPF_JMP:
+ case BPF_JMP32: {
u8 opcode = BPF_OP(insn->code);
env->jmps_processed++;
if (opcode == BPF_CALL) {
- if (BPF_SRC(insn->code) != BPF_K ||
- (insn->src_reg != BPF_PSEUDO_KFUNC_CALL &&
- insn->off != 0) ||
- (insn->src_reg != BPF_REG_0 &&
- insn->src_reg != BPF_PSEUDO_CALL &&
- insn->src_reg != BPF_PSEUDO_KFUNC_CALL) ||
- insn->dst_reg != BPF_REG_0 || class == BPF_JMP32) {
- verbose(env, "BPF_CALL uses reserved fields\n");
- return -EINVAL;
- }
-
if (env->cur_state->active_locks) {
if ((insn->src_reg == BPF_REG_0 &&
- insn->imm != BPF_FUNC_spin_unlock) ||
+ insn->imm != BPF_FUNC_spin_unlock &&
+ insn->imm != BPF_FUNC_kptr_xchg) ||
(insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
(insn->off != 0 || !kfunc_spin_allowed(insn->imm)))) {
verbose(env,
@@ -21056,84 +17619,45 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state)
return -EINVAL;
}
}
- if (insn->src_reg == BPF_PSEUDO_CALL) {
- err = check_func_call(env, insn, &env->insn_idx);
- } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
- err = check_kfunc_call(env, insn, &env->insn_idx);
- if (!err && is_bpf_throw_kfunc(insn))
- return process_bpf_exit_full(env, do_print_state, true);
- } else {
- err = check_helper_call(env, insn, &env->insn_idx);
- }
- if (err)
- return err;
-
mark_reg_scratched(env, BPF_REG_0);
+ if (insn->src_reg == BPF_PSEUDO_CALL)
+ return check_func_call(env, insn, &env->insn_idx);
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+ return check_kfunc_call(env, insn, &env->insn_idx);
+ return check_helper_call(env, insn, &env->insn_idx);
} else if (opcode == BPF_JA) {
- if (BPF_SRC(insn->code) == BPF_X) {
- if (insn->src_reg != BPF_REG_0 ||
- insn->imm != 0 || insn->off != 0) {
- verbose(env, "BPF_JA|BPF_X uses reserved fields\n");
- return -EINVAL;
- }
+ if (BPF_SRC(insn->code) == BPF_X)
return check_indirect_jump(env, insn);
- }
-
- if (BPF_SRC(insn->code) != BPF_K ||
- insn->src_reg != BPF_REG_0 ||
- insn->dst_reg != BPF_REG_0 ||
- (class == BPF_JMP && insn->imm != 0) ||
- (class == BPF_JMP32 && insn->off != 0)) {
- verbose(env, "BPF_JA uses reserved fields\n");
- return -EINVAL;
- }
if (class == BPF_JMP)
env->insn_idx += insn->off + 1;
else
env->insn_idx += insn->imm + 1;
- return 0;
+ return INSN_IDX_UPDATED;
} else if (opcode == BPF_EXIT) {
- if (BPF_SRC(insn->code) != BPF_K ||
- insn->imm != 0 ||
- insn->src_reg != BPF_REG_0 ||
- insn->dst_reg != BPF_REG_0 ||
- class == BPF_JMP32) {
- verbose(env, "BPF_EXIT uses reserved fields\n");
- return -EINVAL;
- }
return process_bpf_exit_full(env, do_print_state, false);
- } else {
- err = check_cond_jmp_op(env, insn, &env->insn_idx);
- if (err)
- return err;
}
- } else if (class == BPF_LD) {
+ return check_cond_jmp_op(env, insn, &env->insn_idx);
+ }
+ case BPF_LD: {
u8 mode = BPF_MODE(insn->code);
- if (mode == BPF_ABS || mode == BPF_IND) {
- err = check_ld_abs(env, insn);
- if (err)
- return err;
+ if (mode == BPF_ABS || mode == BPF_IND)
+ return check_ld_abs(env, insn);
- } else if (mode == BPF_IMM) {
+ if (mode == BPF_IMM) {
err = check_ld_imm(env, insn);
if (err)
return err;
env->insn_idx++;
sanitize_mark_insn_seen(env);
- } else {
- verbose(env, "invalid BPF_LD mode\n");
- return -EINVAL;
}
- } else {
- verbose(env, "unknown insn class %d\n", class);
- return -EINVAL;
+ return 0;
}
-
- env->insn_idx++;
- return 0;
+ }
+ /* all class values are handled above. silence compiler warning */
+ return -EFAULT;
}
static int do_check(struct bpf_verifier_env *env)
@@ -21148,7 +17672,7 @@ static int do_check(struct bpf_verifier_env *env)
for (;;) {
struct bpf_insn *insn;
struct bpf_insn_aux_data *insn_aux;
- int err, marks_err;
+ int err;
/* reset current history entry on each new instruction */
env->cur_hist_ent = NULL;
@@ -21173,8 +17697,8 @@ static int do_check(struct bpf_verifier_env *env)
state->last_insn_idx = env->prev_insn_idx;
state->insn_idx = env->insn_idx;
- if (is_prune_point(env, env->insn_idx)) {
- err = is_state_visited(env, env->insn_idx);
+ if (bpf_is_prune_point(env, env->insn_idx)) {
+ err = bpf_is_state_visited(env, env->insn_idx);
if (err < 0)
return err;
if (err == 1) {
@@ -21192,8 +17716,8 @@ static int do_check(struct bpf_verifier_env *env)
}
}
- if (is_jmp_point(env, env->insn_idx)) {
- err = push_jmp_history(env, state, 0, 0);
+ if (bpf_is_jmp_point(env, env->insn_idx)) {
+ err = bpf_push_jmp_history(env, state, 0, 0);
if (err)
return err;
}
@@ -21220,7 +17744,7 @@ static int do_check(struct bpf_verifier_env *env)
verbose_linfo(env, env->insn_idx, "; ");
env->prev_log_pos = env->log.end_pos;
verbose(env, "%d: ", env->insn_idx);
- verbose_insn(env, insn);
+ bpf_verbose_insn(env, insn);
env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos;
env->prev_log_pos = env->log.end_pos;
}
@@ -21235,21 +17759,34 @@ static int do_check(struct bpf_verifier_env *env)
sanitize_mark_insn_seen(env);
prev_insn_idx = env->insn_idx;
+ /* Sanity check: precomputed constants must match verifier state */
+ if (!state->speculative && insn_aux->const_reg_mask) {
+ struct bpf_reg_state *regs = cur_regs(env);
+ u16 mask = insn_aux->const_reg_mask;
+
+ for (int r = 0; r < ARRAY_SIZE(insn_aux->const_reg_vals); r++) {
+ u32 cval = insn_aux->const_reg_vals[r];
+
+ if (!(mask & BIT(r)))
+ continue;
+ if (regs[r].type != SCALAR_VALUE)
+ continue;
+ if (!tnum_is_const(regs[r].var_off))
+ continue;
+ if (verifier_bug_if((u32)regs[r].var_off.value != cval,
+ env, "const R%d: %u != %llu",
+ r, cval, regs[r].var_off.value))
+ return -EFAULT;
+ }
+ }
+
/* Reduce verification complexity by stopping speculative path
* verification when a nospec is encountered.
*/
if (state->speculative && insn_aux->nospec)
goto process_bpf_exit;
- err = bpf_reset_stack_write_marks(env, env->insn_idx);
- if (err)
- return err;
err = do_check_insn(env, &do_print_state);
- if (err >= 0 || error_recoverable_with_nospec(err)) {
- marks_err = bpf_commit_stack_write_marks(env);
- if (marks_err)
- return marks_err;
- }
if (error_recoverable_with_nospec(err) && state->speculative) {
/* Prevent this speculative path from ever reaching the
* insn that would have been unsafe to execute.
@@ -21264,8 +17801,10 @@ static int do_check(struct bpf_verifier_env *env)
return err;
} else if (err == PROCESS_BPF_EXIT) {
goto process_bpf_exit;
+ } else if (err == INSN_IDX_UPDATED) {
+ } else if (err == 0) {
+ env->insn_idx++;
}
- WARN_ON_ONCE(err);
if (state->speculative && insn_aux->nospec_result) {
/* If we are on a path that performed a jump-op, this
@@ -21289,10 +17828,7 @@ static int do_check(struct bpf_verifier_env *env)
return -EFAULT;
process_bpf_exit:
mark_verifier_state_scratched(env);
- err = update_branch_counts(env, env->cur_state);
- if (err)
- return err;
- err = bpf_update_live_stack(env);
+ err = bpf_update_branch_counts(env, env->cur_state);
if (err)
return err;
err = pop_stack(env, &prev_insn_idx, &env->insn_idx,
@@ -21685,14 +18221,199 @@ static int add_used_map(struct bpf_verifier_env *env, int fd)
return __add_used_map(env, map);
}
-/* find and rewrite pseudo imm in ld_imm64 instructions:
/*
 * Statically validate the reserved encoding fields of a BPF_ALU/BPF_ALU64
 * instruction before per-path verification starts.
 *
 * Returns 0 if the encoding is well formed, -EINVAL (with a verbose message)
 * otherwise.
 */
static int check_alu_fields(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
	u8 class = BPF_CLASS(insn->code);
	u8 opcode = BPF_OP(insn->code);

	switch (opcode) {
	case BPF_NEG:
		/* NEG takes no source operand: src_reg, off and imm must be 0 */
		if (BPF_SRC(insn->code) != BPF_K || insn->src_reg != BPF_REG_0 ||
		    insn->off != 0 || insn->imm != 0) {
			verbose(env, "BPF_NEG uses reserved fields\n");
			return -EINVAL;
		}
		return 0;
	case BPF_END:
		/* imm selects the swap width (16/32/64); for ALU64 only the
		 * BPF_TO_LE source encoding is valid
		 */
		if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
		    (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
		    (class == BPF_ALU64 && BPF_SRC(insn->code) != BPF_TO_LE)) {
			verbose(env, "BPF_END uses reserved fields\n");
			return -EINVAL;
		}
		return 0;
	case BPF_MOV:
		if (BPF_SRC(insn->code) == BPF_X) {
			if (class == BPF_ALU) {
				/* 32-bit MOV: off 8/16 encodes sign extension */
				if ((insn->off != 0 && insn->off != 8 && insn->off != 16) ||
				    insn->imm) {
					verbose(env, "BPF_MOV uses reserved fields\n");
					return -EINVAL;
				}
			} else if (insn->off == BPF_ADDR_SPACE_CAST) {
				/* imm encodes src/dst address spaces; only
				 * casts between space 0 and 1 are accepted
				 */
				if (insn->imm != 1 && insn->imm != 1u << 16) {
					verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n");
					return -EINVAL;
				}
			} else if ((insn->off != 0 && insn->off != 8 &&
				    insn->off != 16 && insn->off != 32) || insn->imm) {
				/* 64-bit MOV: off 8/16/32 encodes sign extension */
				verbose(env, "BPF_MOV uses reserved fields\n");
				return -EINVAL;
			}
		} else if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
			/* MOV with immediate: no source register, no offset */
			verbose(env, "BPF_MOV uses reserved fields\n");
			return -EINVAL;
		}
		return 0;
	case BPF_ADD:
	case BPF_SUB:
	case BPF_AND:
	case BPF_OR:
	case BPF_XOR:
	case BPF_LSH:
	case BPF_RSH:
	case BPF_ARSH:
	case BPF_MUL:
	case BPF_DIV:
	case BPF_MOD:
		/* off == 1 flags signed division/modulo; it is reserved for
		 * every other binary ALU opcode
		 */
		if (BPF_SRC(insn->code) == BPF_X) {
			if (insn->imm != 0 || (insn->off != 0 && insn->off != 1) ||
			    (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
				verbose(env, "BPF_ALU uses reserved fields\n");
				return -EINVAL;
			}
		} else if (insn->src_reg != BPF_REG_0 ||
			   (insn->off != 0 && insn->off != 1) ||
			   (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
			verbose(env, "BPF_ALU uses reserved fields\n");
			return -EINVAL;
		}
		return 0;
	default:
		verbose(env, "invalid BPF_ALU opcode %x\n", opcode);
		return -EINVAL;
	}
}
+
/*
 * Statically validate the reserved encoding fields of a BPF_JMP/BPF_JMP32
 * instruction.
 *
 * Returns 0 if the encoding is well formed, -EINVAL otherwise.
 */
static int check_jmp_fields(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
	u8 class = BPF_CLASS(insn->code);
	u8 opcode = BPF_OP(insn->code);

	switch (opcode) {
	case BPF_CALL:
		/* off is only meaningful for kfunc calls; src_reg selects the
		 * call flavor (helper/pseudo/kfunc); JMP32 CALL is invalid
		 */
		if (BPF_SRC(insn->code) != BPF_K ||
		    (insn->src_reg != BPF_PSEUDO_KFUNC_CALL && insn->off != 0) ||
		    (insn->src_reg != BPF_REG_0 && insn->src_reg != BPF_PSEUDO_CALL &&
		     insn->src_reg != BPF_PSEUDO_KFUNC_CALL) ||
		    insn->dst_reg != BPF_REG_0 || class == BPF_JMP32) {
			verbose(env, "BPF_CALL uses reserved fields\n");
			return -EINVAL;
		}
		return 0;
	case BPF_JA:
		if (BPF_SRC(insn->code) == BPF_X) {
			/* indirect jump: target comes from dst_reg only */
			if (insn->src_reg != BPF_REG_0 || insn->imm != 0 || insn->off != 0) {
				verbose(env, "BPF_JA|BPF_X uses reserved fields\n");
				return -EINVAL;
			}
		} else if (insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0 ||
			   (class == BPF_JMP && insn->imm != 0) ||
			   (class == BPF_JMP32 && insn->off != 0)) {
			/* JMP JA uses off as target, JMP32 JA uses imm */
			verbose(env, "BPF_JA uses reserved fields\n");
			return -EINVAL;
		}
		return 0;
	case BPF_EXIT:
		if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 ||
		    insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0 ||
		    class == BPF_JMP32) {
			verbose(env, "BPF_EXIT uses reserved fields\n");
			return -EINVAL;
		}
		return 0;
	case BPF_JCOND:
		/* only the may_goto flavor (src_reg == BPF_MAY_GOTO) exists */
		if (insn->code != (BPF_JMP | BPF_JCOND) || insn->src_reg != BPF_MAY_GOTO ||
		    insn->dst_reg || insn->imm) {
			verbose(env, "invalid may_goto imm %d\n", insn->imm);
			return -EINVAL;
		}
		return 0;
	default:
		/* conditional jumps: BPF_X compares registers (imm unused),
		 * BPF_K compares against imm (src_reg unused)
		 */
		if (BPF_SRC(insn->code) == BPF_X) {
			if (insn->imm != 0) {
				verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
				return -EINVAL;
			}
		} else if (insn->src_reg != BPF_REG_0) {
			verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
			return -EINVAL;
		}
		return 0;
	}
}
+
/*
 * Top-level per-instruction encoding check, dispatched on the instruction
 * class. ALU and JMP classes are delegated to check_alu_fields() and
 * check_jmp_fields(); the remaining classes are validated inline.
 *
 * Returns 0 on a valid encoding, -EINVAL otherwise.
 */
static int check_insn_fields(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
	switch (BPF_CLASS(insn->code)) {
	case BPF_ALU:
	case BPF_ALU64:
		return check_alu_fields(env, insn);
	case BPF_LDX:
		/* only plain and sign-extending memory loads; imm is reserved */
		if ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) ||
		    insn->imm != 0) {
			verbose(env, "BPF_LDX uses reserved fields\n");
			return -EINVAL;
		}
		return 0;
	case BPF_STX:
		/* atomic ops have their own field rules, checked elsewhere */
		if (BPF_MODE(insn->code) == BPF_ATOMIC)
			return 0;
		if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) {
			verbose(env, "BPF_STX uses reserved fields\n");
			return -EINVAL;
		}
		return 0;
	case BPF_ST:
		/* store-immediate: src_reg is reserved */
		if (BPF_MODE(insn->code) != BPF_MEM || insn->src_reg != BPF_REG_0) {
			verbose(env, "BPF_ST uses reserved fields\n");
			return -EINVAL;
		}
		return 0;
	case BPF_JMP:
	case BPF_JMP32:
		return check_jmp_fields(env, insn);
	case BPF_LD: {
		u8 mode = BPF_MODE(insn->code);

		if (mode == BPF_ABS || mode == BPF_IND) {
			/* legacy packet loads: result lands in R0, no DW size,
			 * ABS takes no source register
			 */
			if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
			    BPF_SIZE(insn->code) == BPF_DW ||
			    (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
				verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n");
				return -EINVAL;
			}
		} else if (mode != BPF_IMM) {
			verbose(env, "invalid BPF_LD mode\n");
			return -EINVAL;
		}
		return 0;
	}
	default:
		verbose(env, "unknown insn class %d\n", BPF_CLASS(insn->code));
		return -EINVAL;
	}
}
+
+/*
+ * Check that insns are sane and rewrite pseudo imm in ld_imm64 instructions:
*
* 1. if it accesses map FD, replace it with actual map pointer.
* 2. if it accesses btf_id of a VAR, replace it with pointer to the var.
*
* NOTE: btf_vmlinux is required for converting pseudo btf_id.
*/
-static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
+static int check_and_resolve_insns(struct bpf_verifier_env *env)
{
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
@@ -21703,13 +18424,14 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
return err;
for (i = 0; i < insn_cnt; i++, insn++) {
- if (BPF_CLASS(insn->code) == BPF_LDX &&
- ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) ||
- insn->imm != 0)) {
- verbose(env, "BPF_LDX uses reserved fields\n");
+ if (insn->dst_reg >= MAX_BPF_REG) {
+ verbose(env, "R%d is invalid\n", insn->dst_reg);
+ return -EINVAL;
+ }
+ if (insn->src_reg >= MAX_BPF_REG) {
+ verbose(env, "R%d is invalid\n", insn->src_reg);
return -EINVAL;
}
-
if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
struct bpf_insn_aux_data *aux;
struct bpf_map *map;
@@ -21724,6 +18446,11 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
return -EINVAL;
}
+ if (insn[0].off != 0) {
+ verbose(env, "BPF_LD_IMM64 uses reserved fields\n");
+ return -EINVAL;
+ }
+
if (insn[0].src_reg == 0)
/* valid generic load 64-bit imm */
goto next_insn;
@@ -21820,6 +18547,10 @@ next_insn:
verbose(env, "unknown opcode %02x\n", insn->code);
return -EINVAL;
}
+
+ err = check_insn_fields(env, insn);
+ if (err)
+ return err;
}
/* now all pseudo BPF_LD_IMM64 instructions load valid
@@ -21858,53 +18589,6 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
}
}
/* single env->prog->insni[off] instruction was replaced with the range
 * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying
 * [0, off) and [off, end) to new locations, so the patched range stays zero
 */
static void adjust_insn_aux_data(struct bpf_verifier_env *env,
				 struct bpf_prog *new_prog, u32 off, u32 cnt)
{
	struct bpf_insn_aux_data *data = env->insn_aux_data;
	struct bpf_insn *insn = new_prog->insnsi;
	u32 old_seen = data[off].seen;
	u32 prog_len;
	int i;

	/* aux info at OFF always needs adjustment, no matter fast path
	 * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the
	 * original insn at old prog.
	 */
	data[off].zext_dst = insn_has_def32(insn + off + cnt - 1);

	if (cnt == 1)
		return;
	prog_len = new_prog->len;

	/* shift the tail of the aux array up by cnt - 1 slots, then zero the
	 * freshly exposed slots so the patched insns start with clean aux state
	 */
	memmove(data + off + cnt - 1, data + off,
		sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
	memset(data + off, 0, sizeof(struct bpf_insn_aux_data) * (cnt - 1));
	for (i = off; i < off + cnt - 1; i++) {
		/* Expand insni[off]'s seen count to the patched range. */
		data[i].seen = old_seen;
		data[i].zext_dst = insn_has_def32(insn + i);
	}
}
-
-static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
-{
- int i;
-
- if (len == 1)
- return;
- /* NOTE: fake 'exit' subprog should be updated as well. */
- for (i = 0; i <= env->subprog_cnt; i++) {
- if (env->subprog_info[i].start <= off)
- continue;
- env->subprog_info[i].start += len - 1;
- }
-}
-
static void release_insn_arrays(struct bpf_verifier_env *env)
{
int i;
@@ -21913,281 +18597,7 @@ static void release_insn_arrays(struct bpf_verifier_env *env)
bpf_insn_array_release(env->insn_array_maps[i]);
}
-static void adjust_insn_arrays(struct bpf_verifier_env *env, u32 off, u32 len)
-{
- int i;
-
- if (len == 1)
- return;
-
- for (i = 0; i < env->insn_array_map_cnt; i++)
- bpf_insn_array_adjust(env->insn_array_maps[i], off, len);
-}
-
-static void adjust_insn_arrays_after_remove(struct bpf_verifier_env *env, u32 off, u32 len)
-{
- int i;
-
- for (i = 0; i < env->insn_array_map_cnt; i++)
- bpf_insn_array_adjust_after_remove(env->insn_array_maps[i], off, len);
-}
-
-static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len)
-{
- struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
- int i, sz = prog->aux->size_poke_tab;
- struct bpf_jit_poke_descriptor *desc;
-
- for (i = 0; i < sz; i++) {
- desc = &tab[i];
- if (desc->insn_idx <= off)
- continue;
- desc->insn_idx += len - 1;
- }
-}
-
/*
 * Replace the single insn at @off with the @len-insn sequence @patch and keep
 * all verifier side tables (aux data, subprog starts, insn arrays, poke
 * descriptors) consistent with the new layout.
 *
 * Returns the patched program, or NULL on allocation/patching failure.
 */
static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off,
					    const struct bpf_insn *patch, u32 len)
{
	struct bpf_prog *new_prog;
	struct bpf_insn_aux_data *new_data = NULL;

	if (len > 1) {
		/* grow aux data before patching so a failed vrealloc leaves
		 * prog and aux still in sync
		 */
		new_data = vrealloc(env->insn_aux_data,
				    array_size(env->prog->len + len - 1,
					       sizeof(struct bpf_insn_aux_data)),
				    GFP_KERNEL_ACCOUNT | __GFP_ZERO);
		if (!new_data)
			return NULL;

		env->insn_aux_data = new_data;
	}

	new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
	if (IS_ERR(new_prog)) {
		if (PTR_ERR(new_prog) == -ERANGE)
			verbose(env,
				"insn %d cannot be patched due to 16-bit range\n",
				env->insn_aux_data[off].orig_idx);
		return NULL;
	}
	adjust_insn_aux_data(env, new_prog, off, len);
	adjust_subprog_starts(env, off, len);
	adjust_insn_arrays(env, off, len);
	adjust_poke_descs(new_prog, off, len);
	return new_prog;
}
-
/*
 * For all jmp insns in a given 'prog' that point to 'tgt_idx' insn adjust the
 * jump offset by 'delta'.
 *
 * Returns 0 on success, -ERANGE if an adjusted offset would overflow its
 * field (s16 off, or s32 imm for JMP32|JA).
 */
static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta)
{
	struct bpf_insn *insn = prog->insnsi;
	u32 insn_cnt = prog->len, i;
	s32 imm;
	s16 off;

	for (i = 0; i < insn_cnt; i++, insn++) {
		u8 code = insn->code;

		/* insns inside the inserted range themselves need no fixup */
		if (tgt_idx <= i && i < tgt_idx + delta)
			continue;

		if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) ||
		    BPF_OP(code) == BPF_CALL || BPF_OP(code) == BPF_EXIT)
			continue;

		if (insn->code == (BPF_JMP32 | BPF_JA)) {
			/* JMP32|JA encodes its target in imm, not off */
			if (i + 1 + insn->imm != tgt_idx)
				continue;
			if (check_add_overflow(insn->imm, delta, &imm))
				return -ERANGE;
			insn->imm = imm;
		} else {
			if (i + 1 + insn->off != tgt_idx)
				continue;
			if (check_add_overflow(insn->off, delta, &off))
				return -ERANGE;
			insn->off = off;
		}
	}
	return 0;
}
-
/*
 * After removing insns [off, off + cnt), drop subprogs fully contained in the
 * removed range (and their func_info entries) and shift the start of every
 * remaining subprog back by cnt.
 *
 * Always returns 0.
 */
static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
					      u32 off, u32 cnt)
{
	int i, j;

	/* find first prog starting at or after off (first to remove) */
	for (i = 0; i < env->subprog_cnt; i++)
		if (env->subprog_info[i].start >= off)
			break;
	/* find first prog starting at or after off + cnt (first to stay) */
	for (j = i; j < env->subprog_cnt; j++)
		if (env->subprog_info[j].start >= off + cnt)
			break;
	/* if j doesn't start exactly at off + cnt, we are just removing
	 * the front of previous prog
	 */
	if (env->subprog_info[j].start != off + cnt)
		j--;

	if (j > i) {
		struct bpf_prog_aux *aux = env->prog->aux;
		int move;

		/* move fake 'exit' subprog as well */
		move = env->subprog_cnt + 1 - j;

		memmove(env->subprog_info + i,
			env->subprog_info + j,
			sizeof(*env->subprog_info) * move);
		env->subprog_cnt -= j - i;

		/* remove func_info */
		if (aux->func_info) {
			move = aux->func_info_cnt - j;

			memmove(aux->func_info + i,
				aux->func_info + j,
				sizeof(*aux->func_info) * move);
			aux->func_info_cnt -= j - i;
			/* func_info->insn_off is set after all code rewrites,
			 * in adjust_btf_func() - no need to adjust
			 */
		}
	} else {
		/* convert i from "first prog to remove" to "first to adjust" */
		if (env->subprog_info[i].start == off)
			i++;
	}

	/* update fake 'exit' subprog as well */
	for (; i <= env->subprog_cnt; i++)
		env->subprog_info[i].start -= cnt;

	return 0;
}
-
/*
 * After removing insns [off, off + cnt), drop line-info entries that referred
 * only to removed insns, pull the remaining insn_off values back by cnt, and
 * fix up each subprog's linfo_idx.
 *
 * Always returns 0.
 */
static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off,
				      u32 cnt)
{
	struct bpf_prog *prog = env->prog;
	u32 i, l_off, l_cnt, nr_linfo;
	struct bpf_line_info *linfo;

	nr_linfo = prog->aux->nr_linfo;
	if (!nr_linfo)
		return 0;

	linfo = prog->aux->linfo;

	/* find first line info to remove, count lines to be removed */
	for (i = 0; i < nr_linfo; i++)
		if (linfo[i].insn_off >= off)
			break;

	l_off = i;
	l_cnt = 0;
	for (; i < nr_linfo; i++)
		if (linfo[i].insn_off < off + cnt)
			l_cnt++;
		else
			break;

	/* First live insn doesn't match first live linfo, it needs to "inherit"
	 * last removed linfo. prog is already modified, so prog->len == off
	 * means no live instructions after (tail of the program was removed).
	 */
	if (prog->len != off && l_cnt &&
	    (i == nr_linfo || linfo[i].insn_off != off + cnt)) {
		l_cnt--;
		linfo[--i].insn_off = off + cnt;
	}

	/* remove the line info which refer to the removed instructions */
	if (l_cnt) {
		memmove(linfo + l_off, linfo + i,
			sizeof(*linfo) * (nr_linfo - i));

		prog->aux->nr_linfo -= l_cnt;
		nr_linfo = prog->aux->nr_linfo;
	}

	/* pull all linfo[i].insn_off >= off + cnt in by cnt */
	for (i = l_off; i < nr_linfo; i++)
		linfo[i].insn_off -= cnt;

	/* fix up all subprogs (incl. 'exit') which start >= off */
	for (i = 0; i <= env->subprog_cnt; i++)
		if (env->subprog_info[i].linfo_idx > l_off) {
			/* program may have started in the removed region but
			 * may not be fully removed
			 */
			if (env->subprog_info[i].linfo_idx >= l_off + l_cnt)
				env->subprog_info[i].linfo_idx -= l_cnt;
			else
				env->subprog_info[i].linfo_idx = l_off;
		}

	return 0;
}
-
/*
 * Clean up dynamically allocated fields of aux data for instructions [start, ...]
 */
static void clear_insn_aux_data(struct bpf_verifier_env *env, int start, int len)
{
	struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
	struct bpf_insn *insns = env->prog->insnsi;
	int end = start + len;
	int i;

	for (i = start; i < end; i++) {
		/* free the jump table attached to this insn, if any */
		if (aux_data[i].jt) {
			kvfree(aux_data[i].jt);
			aux_data[i].jt = NULL;
		}

		/* ldimm64 occupies two insn slots; skip its second half */
		if (bpf_is_ldimm64(&insns[i]))
			i++;
	}
}
-
/*
 * Remove insns [off, off + cnt) from the program and keep all verifier side
 * tables (aux data, subprog starts, line info, insn arrays, offload state)
 * consistent with the shortened program.
 *
 * Returns 0 on success or a negative errno from the removal helpers.
 */
static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
{
	struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
	unsigned int orig_prog_len = env->prog->len;
	int err;

	if (bpf_prog_is_offloaded(env->prog->aux))
		bpf_prog_offload_remove_insns(env, off, cnt);

	/* Should be called before bpf_remove_insns, as it uses prog->insnsi */
	clear_insn_aux_data(env, off, cnt);

	err = bpf_remove_insns(env->prog, off, cnt);
	if (err)
		return err;

	err = adjust_subprog_starts_after_remove(env, off, cnt);
	if (err)
		return err;

	err = bpf_adj_linfo_after_remove(env, off, cnt);
	if (err)
		return err;

	adjust_insn_arrays_after_remove(env, off, cnt);

	/* compact the aux array to match the shortened program */
	memmove(aux_data + off, aux_data + off + cnt,
		sizeof(*aux_data) * (orig_prog_len - off - cnt));

	return 0;
}
/* The verifier does more data flow analysis than llvm and will not
* explore branches that are dead at run time. Malicious programs can
@@ -22216,2210 +18626,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env)
}
}
-static bool insn_is_cond_jump(u8 code)
-{
- u8 op;
-
- op = BPF_OP(code);
- if (BPF_CLASS(code) == BPF_JMP32)
- return op != BPF_JA;
-
- if (BPF_CLASS(code) != BPF_JMP)
- return false;
-
- return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL;
-}
-
/*
 * Rewrite conditional jumps whose fall-through or target insn was never seen
 * during verification into unconditional JA insns (always-taken or
 * never-taken respectively), so dead-code removal can then strip the
 * unreachable side.
 */
static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
{
	struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
	struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
	struct bpf_insn *insn = env->prog->insnsi;
	const int insn_cnt = env->prog->len;
	int i;

	for (i = 0; i < insn_cnt; i++, insn++) {
		if (!insn_is_cond_jump(insn->code))
			continue;

		/* fall-through never reached -> branch always taken;
		 * target never reached -> branch never taken (JA +0)
		 */
		if (!aux_data[i + 1].seen)
			ja.off = insn->off;
		else if (!aux_data[i + 1 + insn->off].seen)
			ja.off = 0;
		else
			continue;

		if (bpf_prog_is_offloaded(env->prog->aux))
			bpf_prog_offload_replace_insn(env, i, &ja);

		memcpy(insn, &ja, sizeof(ja));
	}
}
-
/*
 * Remove every maximal run of insns that verification never reached
 * (aux_data[].seen == 0).
 *
 * Returns 0 on success or a negative errno from verifier_remove_insns().
 */
static int opt_remove_dead_code(struct bpf_verifier_env *env)
{
	struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
	int insn_cnt = env->prog->len;
	int i, err;

	for (i = 0; i < insn_cnt; i++) {
		int j;

		/* extend j over the contiguous unseen run starting at i */
		j = 0;
		while (i + j < insn_cnt && !aux_data[i + j].seen)
			j++;
		if (!j)
			continue;

		err = verifier_remove_insns(env, i, j);
		if (err)
			return err;
		/* removal shrank the program; re-read its length */
		insn_cnt = env->prog->len;
	}

	return 0;
}
-
/* no-op patterns removable by opt_remove_nops(): an unconditional jump to the
 * next insn and a may_goto with zero offset
 */
static const struct bpf_insn NOP = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
static const struct bpf_insn MAY_GOTO_0 = BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 0, 0);
-
/*
 * Remove JA +0 and may_goto +0 no-op insns left behind by earlier rewrites.
 *
 * Returns 0 on success or a negative errno from verifier_remove_insns().
 */
static int opt_remove_nops(struct bpf_verifier_env *env)
{
	struct bpf_insn *insn = env->prog->insnsi;
	int insn_cnt = env->prog->len;
	bool is_may_goto_0, is_ja;
	int i, err;

	for (i = 0; i < insn_cnt; i++) {
		is_may_goto_0 = !memcmp(&insn[i], &MAY_GOTO_0, sizeof(MAY_GOTO_0));
		is_ja = !memcmp(&insn[i], &NOP, sizeof(NOP));

		if (!is_may_goto_0 && !is_ja)
			continue;

		err = verifier_remove_insns(env, i, 1);
		if (err)
			return err;
		insn_cnt--;
		/* Go back one insn to catch may_goto +1; may_goto +0 sequence */
		i -= (is_may_goto_0 && i > 0) ? 2 : 1;
	}

	return 0;
}
-
/*
 * For insns the verifier marked as needing 32-bit zero extension
 * (aux[].zext_dst), patch in an explicit zext insn when the JIT requires it
 * (or the insn is CMPXCHG). With BPF_F_TEST_RND_HI32, instead poison the
 * high 32 bits of non-zext 32-bit results with a random value to flush out
 * programs that rely on implicit zero extension.
 *
 * Returns 0 on success, -ENOMEM/-EFAULT on failure.
 */
static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
					 const union bpf_attr *attr)
{
	struct bpf_insn *patch;
	/* use env->insn_buf as two independent buffers */
	struct bpf_insn *zext_patch = env->insn_buf;
	struct bpf_insn *rnd_hi32_patch = &env->insn_buf[2];
	struct bpf_insn_aux_data *aux = env->insn_aux_data;
	int i, patch_len, delta = 0, len = env->prog->len;
	struct bpf_insn *insns = env->prog->insnsi;
	struct bpf_prog *new_prog;
	bool rnd_hi32;

	rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32;
	/* patch templates; slot 0 of each is filled with the original insn */
	zext_patch[1] = BPF_ZEXT_REG(0);
	rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0);
	rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
	rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX);
	for (i = 0; i < len; i++) {
		/* delta tracks how far indices shifted due to earlier patches */
		int adj_idx = i + delta;
		struct bpf_insn insn;
		int load_reg;

		insn = insns[adj_idx];
		load_reg = insn_def_regno(&insn);
		if (!aux[adj_idx].zext_dst) {
			u8 code, class;
			u32 imm_rnd;

			if (!rnd_hi32)
				continue;

			code = insn.code;
			class = BPF_CLASS(code);
			if (load_reg == -1)
				continue;

			/* NOTE: arg "reg" (the fourth one) is only used for
			 * BPF_STX + SRC_OP, so it is safe to pass NULL
			 * here.
			 */
			if (is_reg64(&insn, load_reg, NULL, DST_OP)) {
				/* 64-bit defs need no hi32 poisoning; ldimm64
				 * spans two slots, skip the second
				 */
				if (class == BPF_LD &&
				    BPF_MODE(code) == BPF_IMM)
					i++;
				continue;
			}

			/* ctx load could be transformed into wider load. */
			if (class == BPF_LDX &&
			    aux[adj_idx].ptr_type == PTR_TO_CTX)
				continue;

			imm_rnd = get_random_u32();
			rnd_hi32_patch[0] = insn;
			rnd_hi32_patch[1].imm = imm_rnd;
			rnd_hi32_patch[3].dst_reg = load_reg;
			patch = rnd_hi32_patch;
			patch_len = 4;
			goto apply_patch_buffer;
		}

		/* Add in an zero-extend instruction if a) the JIT has requested
		 * it or b) it's a CMPXCHG.
		 *
		 * The latter is because: BPF_CMPXCHG always loads a value into
		 * R0, therefore always zero-extends. However some archs'
		 * equivalent instruction only does this load when the
		 * comparison is successful. This detail of CMPXCHG is
		 * orthogonal to the general zero-extension behaviour of the
		 * CPU, so it's treated independently of bpf_jit_needs_zext.
		 */
		if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn))
			continue;

		/* Zero-extension is done by the caller. */
		if (bpf_pseudo_kfunc_call(&insn))
			continue;

		if (verifier_bug_if(load_reg == -1, env,
				    "zext_dst is set, but no reg is defined"))
			return -EFAULT;

		zext_patch[0] = insn;
		zext_patch[1].dst_reg = load_reg;
		zext_patch[1].src_reg = load_reg;
		patch = zext_patch;
		patch_len = 2;
apply_patch_buffer:
		new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len);
		if (!new_prog)
			return -ENOMEM;
		env->prog = new_prog;
		insns = new_prog->insnsi;
		aux = env->insn_aux_data;
		delta += patch_len - 1;
	}

	return 0;
}
-
-/* convert load instructions that access fields of a context type into a
- * sequence of instructions that access fields of the underlying structure:
- * struct __sk_buff -> struct sk_buff
- * struct bpf_sock_ops -> struct sock
- */
-static int convert_ctx_accesses(struct bpf_verifier_env *env)
-{
- struct bpf_subprog_info *subprogs = env->subprog_info;
- const struct bpf_verifier_ops *ops = env->ops;
- int i, cnt, size, ctx_field_size, ret, delta = 0, epilogue_cnt = 0;
- const int insn_cnt = env->prog->len;
- struct bpf_insn *epilogue_buf = env->epilogue_buf;
- struct bpf_insn *insn_buf = env->insn_buf;
- struct bpf_insn *insn;
- u32 target_size, size_default, off;
- struct bpf_prog *new_prog;
- enum bpf_access_type type;
- bool is_narrower_load;
- int epilogue_idx = 0;
-
- if (ops->gen_epilogue) {
- epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog,
- -(subprogs[0].stack_depth + 8));
- if (epilogue_cnt >= INSN_BUF_SIZE) {
- verifier_bug(env, "epilogue is too long");
- return -EFAULT;
- } else if (epilogue_cnt) {
- /* Save the ARG_PTR_TO_CTX for the epilogue to use */
- cnt = 0;
- subprogs[0].stack_depth += 8;
- insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1,
- -subprogs[0].stack_depth);
- insn_buf[cnt++] = env->prog->insnsi[0];
- new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
- env->prog = new_prog;
- delta += cnt - 1;
-
- ret = add_kfunc_in_insns(env, epilogue_buf, epilogue_cnt - 1);
- if (ret < 0)
- return ret;
- }
- }
-
- if (ops->gen_prologue || env->seen_direct_write) {
- if (!ops->gen_prologue) {
- verifier_bug(env, "gen_prologue is null");
- return -EFAULT;
- }
- cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
- env->prog);
- if (cnt >= INSN_BUF_SIZE) {
- verifier_bug(env, "prologue is too long");
- return -EFAULT;
- } else if (cnt) {
- new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- env->prog = new_prog;
- delta += cnt - 1;
-
- ret = add_kfunc_in_insns(env, insn_buf, cnt - 1);
- if (ret < 0)
- return ret;
- }
- }
-
- if (delta)
- WARN_ON(adjust_jmp_off(env->prog, 0, delta));
-
- if (bpf_prog_is_offloaded(env->prog->aux))
- return 0;
-
- insn = env->prog->insnsi + delta;
-
- for (i = 0; i < insn_cnt; i++, insn++) {
- bpf_convert_ctx_access_t convert_ctx_access;
- u8 mode;
-
- if (env->insn_aux_data[i + delta].nospec) {
- WARN_ON_ONCE(env->insn_aux_data[i + delta].alu_state);
- struct bpf_insn *patch = insn_buf;
-
- *patch++ = BPF_ST_NOSPEC();
- *patch++ = *insn;
- cnt = patch - insn_buf;
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- /* This can not be easily merged with the
- * nospec_result-case, because an insn may require a
- * nospec before and after itself. Therefore also do not
- * 'continue' here but potentially apply further
- * patching to insn. *insn should equal patch[1] now.
- */
- }
-
- if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
- insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
- insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
- insn->code == (BPF_LDX | BPF_MEM | BPF_DW) ||
- insn->code == (BPF_LDX | BPF_MEMSX | BPF_B) ||
- insn->code == (BPF_LDX | BPF_MEMSX | BPF_H) ||
- insn->code == (BPF_LDX | BPF_MEMSX | BPF_W)) {
- type = BPF_READ;
- } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_DW) ||
- insn->code == (BPF_ST | BPF_MEM | BPF_B) ||
- insn->code == (BPF_ST | BPF_MEM | BPF_H) ||
- insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
- insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
- type = BPF_WRITE;
- } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_B) ||
- insn->code == (BPF_STX | BPF_ATOMIC | BPF_H) ||
- insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) ||
- insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) &&
- env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) {
- insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code);
- env->prog->aux->num_exentries++;
- continue;
- } else if (insn->code == (BPF_JMP | BPF_EXIT) &&
- epilogue_cnt &&
- i + delta < subprogs[1].start) {
- /* Generate epilogue for the main prog */
- if (epilogue_idx) {
- /* jump back to the earlier generated epilogue */
- insn_buf[0] = BPF_JMP32_A(epilogue_idx - i - delta - 1);
- cnt = 1;
- } else {
- memcpy(insn_buf, epilogue_buf,
- epilogue_cnt * sizeof(*epilogue_buf));
- cnt = epilogue_cnt;
- /* epilogue_idx cannot be 0. It must have at
- * least one ctx ptr saving insn before the
- * epilogue.
- */
- epilogue_idx = i + delta;
- }
- goto patch_insn_buf;
- } else {
- continue;
- }
-
- if (type == BPF_WRITE &&
- env->insn_aux_data[i + delta].nospec_result) {
- /* nospec_result is only used to mitigate Spectre v4 and
- * to limit verification-time for Spectre v1.
- */
- struct bpf_insn *patch = insn_buf;
-
- *patch++ = *insn;
- *patch++ = BPF_ST_NOSPEC();
- cnt = patch - insn_buf;
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- continue;
- }
-
- switch ((int)env->insn_aux_data[i + delta].ptr_type) {
- case PTR_TO_CTX:
- if (!ops->convert_ctx_access)
- continue;
- convert_ctx_access = ops->convert_ctx_access;
- break;
- case PTR_TO_SOCKET:
- case PTR_TO_SOCK_COMMON:
- convert_ctx_access = bpf_sock_convert_ctx_access;
- break;
- case PTR_TO_TCP_SOCK:
- convert_ctx_access = bpf_tcp_sock_convert_ctx_access;
- break;
- case PTR_TO_XDP_SOCK:
- convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
- break;
- case PTR_TO_BTF_ID:
- case PTR_TO_BTF_ID | PTR_UNTRUSTED:
- /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike
- * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot
- * be said once it is marked PTR_UNTRUSTED, hence we must handle
- * any faults for loads into such types. BPF_WRITE is disallowed
- * for this case.
- */
- case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED:
- case PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED:
- if (type == BPF_READ) {
- if (BPF_MODE(insn->code) == BPF_MEM)
- insn->code = BPF_LDX | BPF_PROBE_MEM |
- BPF_SIZE((insn)->code);
- else
- insn->code = BPF_LDX | BPF_PROBE_MEMSX |
- BPF_SIZE((insn)->code);
- env->prog->aux->num_exentries++;
- }
- continue;
- case PTR_TO_ARENA:
- if (BPF_MODE(insn->code) == BPF_MEMSX) {
- if (!bpf_jit_supports_insn(insn, true)) {
- verbose(env, "sign extending loads from arena are not supported yet\n");
- return -EOPNOTSUPP;
- }
- insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32SX | BPF_SIZE(insn->code);
- } else {
- insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code);
- }
- env->prog->aux->num_exentries++;
- continue;
- default:
- continue;
- }
-
- ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
- size = BPF_LDST_BYTES(insn);
- mode = BPF_MODE(insn->code);
-
- /* If the read access is a narrower load of the field,
- * convert to a 4/8-byte load, to minimum program type specific
- * convert_ctx_access changes. If conversion is successful,
- * we will apply proper mask to the result.
- */
- is_narrower_load = size < ctx_field_size;
- size_default = bpf_ctx_off_adjust_machine(ctx_field_size);
- off = insn->off;
- if (is_narrower_load) {
- u8 size_code;
-
- if (type == BPF_WRITE) {
- verifier_bug(env, "narrow ctx access misconfigured");
- return -EFAULT;
- }
-
- size_code = BPF_H;
- if (ctx_field_size == 4)
- size_code = BPF_W;
- else if (ctx_field_size == 8)
- size_code = BPF_DW;
-
- insn->off = off & ~(size_default - 1);
- insn->code = BPF_LDX | BPF_MEM | size_code;
- }
-
- target_size = 0;
- cnt = convert_ctx_access(type, insn, insn_buf, env->prog,
- &target_size);
- if (cnt == 0 || cnt >= INSN_BUF_SIZE ||
- (ctx_field_size && !target_size)) {
- verifier_bug(env, "error during ctx access conversion (%d)", cnt);
- return -EFAULT;
- }
-
- if (is_narrower_load && size < target_size) {
- u8 shift = bpf_ctx_narrow_access_offset(
- off, size, size_default) * 8;
- if (shift && cnt + 1 >= INSN_BUF_SIZE) {
- verifier_bug(env, "narrow ctx load misconfigured");
- return -EFAULT;
- }
- if (ctx_field_size <= 4) {
- if (shift)
- insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
- insn->dst_reg,
- shift);
- insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
- (1 << size * 8) - 1);
- } else {
- if (shift)
- insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
- insn->dst_reg,
- shift);
- insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
- (1ULL << size * 8) - 1);
- }
- }
- if (mode == BPF_MEMSX)
- insn_buf[cnt++] = BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_X,
- insn->dst_reg, insn->dst_reg,
- size * 8, 0);
-
-patch_insn_buf:
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
-
- /* keep walking new program and skip insns we just inserted */
- env->prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- }
-
- return 0;
-}
-
-static int jit_subprogs(struct bpf_verifier_env *env)
-{
- struct bpf_prog *prog = env->prog, **func, *tmp;
- int i, j, subprog_start, subprog_end = 0, len, subprog;
- struct bpf_map *map_ptr;
- struct bpf_insn *insn;
- void *old_bpf_func;
- int err, num_exentries;
- int old_len, subprog_start_adjustment = 0;
-
- if (env->subprog_cnt <= 1)
- return 0;
-
- for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
- if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn))
- continue;
-
- /* Upon error here we cannot fall back to interpreter but
- * need a hard reject of the program. Thus -EFAULT is
- * propagated in any case.
- */
- subprog = find_subprog(env, i + insn->imm + 1);
- if (verifier_bug_if(subprog < 0, env, "No program to jit at insn %d",
- i + insn->imm + 1))
- return -EFAULT;
- /* temporarily remember subprog id inside insn instead of
- * aux_data, since next loop will split up all insns into funcs
- */
- insn->off = subprog;
- /* remember original imm in case JIT fails and fallback
- * to interpreter will be needed
- */
- env->insn_aux_data[i].call_imm = insn->imm;
- /* point imm to __bpf_call_base+1 from JITs point of view */
- insn->imm = 1;
- if (bpf_pseudo_func(insn)) {
-#if defined(MODULES_VADDR)
- u64 addr = MODULES_VADDR;
-#else
- u64 addr = VMALLOC_START;
-#endif
- /* jit (e.g. x86_64) may emit fewer instructions
- * if it learns a u32 imm is the same as a u64 imm.
- * Set close enough to possible prog address.
- */
- insn[0].imm = (u32)addr;
- insn[1].imm = addr >> 32;
- }
- }
-
- err = bpf_prog_alloc_jited_linfo(prog);
- if (err)
- goto out_undo_insn;
-
- err = -ENOMEM;
- func = kzalloc_objs(prog, env->subprog_cnt);
- if (!func)
- goto out_undo_insn;
-
- for (i = 0; i < env->subprog_cnt; i++) {
- subprog_start = subprog_end;
- subprog_end = env->subprog_info[i + 1].start;
-
- len = subprog_end - subprog_start;
- /* bpf_prog_run() doesn't call subprogs directly,
- * hence main prog stats include the runtime of subprogs.
- * subprogs don't have IDs and not reachable via prog_get_next_id
- * func[i]->stats will never be accessed and stays NULL
- */
- func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);
- if (!func[i])
- goto out_free;
- memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],
- len * sizeof(struct bpf_insn));
- func[i]->type = prog->type;
- func[i]->len = len;
- if (bpf_prog_calc_tag(func[i]))
- goto out_free;
- func[i]->is_func = 1;
- func[i]->sleepable = prog->sleepable;
- func[i]->aux->func_idx = i;
- /* Below members will be freed only at prog->aux */
- func[i]->aux->btf = prog->aux->btf;
- func[i]->aux->subprog_start = subprog_start + subprog_start_adjustment;
- func[i]->aux->func_info = prog->aux->func_info;
- func[i]->aux->func_info_cnt = prog->aux->func_info_cnt;
- func[i]->aux->poke_tab = prog->aux->poke_tab;
- func[i]->aux->size_poke_tab = prog->aux->size_poke_tab;
- func[i]->aux->main_prog_aux = prog->aux;
-
- for (j = 0; j < prog->aux->size_poke_tab; j++) {
- struct bpf_jit_poke_descriptor *poke;
-
- poke = &prog->aux->poke_tab[j];
- if (poke->insn_idx < subprog_end &&
- poke->insn_idx >= subprog_start)
- poke->aux = func[i]->aux;
- }
-
- func[i]->aux->name[0] = 'F';
- func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
- if (env->subprog_info[i].priv_stack_mode == PRIV_STACK_ADAPTIVE)
- func[i]->aux->jits_use_priv_stack = true;
-
- func[i]->jit_requested = 1;
- func[i]->blinding_requested = prog->blinding_requested;
- func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
- func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab;
- func[i]->aux->linfo = prog->aux->linfo;
- func[i]->aux->nr_linfo = prog->aux->nr_linfo;
- func[i]->aux->jited_linfo = prog->aux->jited_linfo;
- func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
- func[i]->aux->arena = prog->aux->arena;
- func[i]->aux->used_maps = env->used_maps;
- func[i]->aux->used_map_cnt = env->used_map_cnt;
- num_exentries = 0;
- insn = func[i]->insnsi;
- for (j = 0; j < func[i]->len; j++, insn++) {
- if (BPF_CLASS(insn->code) == BPF_LDX &&
- (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
- BPF_MODE(insn->code) == BPF_PROBE_MEM32 ||
- BPF_MODE(insn->code) == BPF_PROBE_MEM32SX ||
- BPF_MODE(insn->code) == BPF_PROBE_MEMSX))
- num_exentries++;
- if ((BPF_CLASS(insn->code) == BPF_STX ||
- BPF_CLASS(insn->code) == BPF_ST) &&
- BPF_MODE(insn->code) == BPF_PROBE_MEM32)
- num_exentries++;
- if (BPF_CLASS(insn->code) == BPF_STX &&
- BPF_MODE(insn->code) == BPF_PROBE_ATOMIC)
- num_exentries++;
- }
- func[i]->aux->num_exentries = num_exentries;
- func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
- func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb;
- func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data;
- func[i]->aux->might_sleep = env->subprog_info[i].might_sleep;
- if (!i)
- func[i]->aux->exception_boundary = env->seen_exception;
-
- /*
- * To properly pass the absolute subprog start to jit
- * all instruction adjustments should be accumulated
- */
- old_len = func[i]->len;
- func[i] = bpf_int_jit_compile(func[i]);
- subprog_start_adjustment += func[i]->len - old_len;
-
- if (!func[i]->jited) {
- err = -ENOTSUPP;
- goto out_free;
- }
- cond_resched();
- }
-
- /* at this point all bpf functions were successfully JITed
- * now populate all bpf_calls with correct addresses and
- * run last pass of JIT
- */
- for (i = 0; i < env->subprog_cnt; i++) {
- insn = func[i]->insnsi;
- for (j = 0; j < func[i]->len; j++, insn++) {
- if (bpf_pseudo_func(insn)) {
- subprog = insn->off;
- insn[0].imm = (u32)(long)func[subprog]->bpf_func;
- insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32;
- continue;
- }
- if (!bpf_pseudo_call(insn))
- continue;
- subprog = insn->off;
- insn->imm = BPF_CALL_IMM(func[subprog]->bpf_func);
- }
-
- /* we use the aux data to keep a list of the start addresses
- * of the JITed images for each function in the program
- *
- * for some architectures, such as powerpc64, the imm field
- * might not be large enough to hold the offset of the start
- * address of the callee's JITed image from __bpf_call_base
- *
- * in such cases, we can lookup the start address of a callee
- * by using its subprog id, available from the off field of
- * the call instruction, as an index for this list
- */
- func[i]->aux->func = func;
- func[i]->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
- func[i]->aux->real_func_cnt = env->subprog_cnt;
- }
- for (i = 0; i < env->subprog_cnt; i++) {
- old_bpf_func = func[i]->bpf_func;
- tmp = bpf_int_jit_compile(func[i]);
- if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
- verbose(env, "JIT doesn't support bpf-to-bpf calls\n");
- err = -ENOTSUPP;
- goto out_free;
- }
- cond_resched();
- }
-
- /*
- * Cleanup func[i]->aux fields which aren't required
- * or can become invalid in future
- */
- for (i = 0; i < env->subprog_cnt; i++) {
- func[i]->aux->used_maps = NULL;
- func[i]->aux->used_map_cnt = 0;
- }
-
- /* finally lock prog and jit images for all functions and
- * populate kallsysm. Begin at the first subprogram, since
- * bpf_prog_load will add the kallsyms for the main program.
- */
- for (i = 1; i < env->subprog_cnt; i++) {
- err = bpf_prog_lock_ro(func[i]);
- if (err)
- goto out_free;
- }
-
- for (i = 1; i < env->subprog_cnt; i++)
- bpf_prog_kallsyms_add(func[i]);
-
- /* Last step: make now unused interpreter insns from main
- * prog consistent for later dump requests, so they can
- * later look the same as if they were interpreted only.
- */
- for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
- if (bpf_pseudo_func(insn)) {
- insn[0].imm = env->insn_aux_data[i].call_imm;
- insn[1].imm = insn->off;
- insn->off = 0;
- continue;
- }
- if (!bpf_pseudo_call(insn))
- continue;
- insn->off = env->insn_aux_data[i].call_imm;
- subprog = find_subprog(env, i + insn->off + 1);
- insn->imm = subprog;
- }
-
- prog->jited = 1;
- prog->bpf_func = func[0]->bpf_func;
- prog->jited_len = func[0]->jited_len;
- prog->aux->extable = func[0]->aux->extable;
- prog->aux->num_exentries = func[0]->aux->num_exentries;
- prog->aux->func = func;
- prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
- prog->aux->real_func_cnt = env->subprog_cnt;
- prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func;
- prog->aux->exception_boundary = func[0]->aux->exception_boundary;
- bpf_prog_jit_attempt_done(prog);
- return 0;
-out_free:
- /* We failed JIT'ing, so at this point we need to unregister poke
- * descriptors from subprogs, so that kernel is not attempting to
- * patch it anymore as we're freeing the subprog JIT memory.
- */
- for (i = 0; i < prog->aux->size_poke_tab; i++) {
- map_ptr = prog->aux->poke_tab[i].tail_call.map;
- map_ptr->ops->map_poke_untrack(map_ptr, prog->aux);
- }
- /* At this point we're guaranteed that poke descriptors are not
- * live anymore. We can just unlink its descriptor table as it's
- * released with the main prog.
- */
- for (i = 0; i < env->subprog_cnt; i++) {
- if (!func[i])
- continue;
- func[i]->aux->poke_tab = NULL;
- bpf_jit_free(func[i]);
- }
- kfree(func);
-out_undo_insn:
- /* cleanup main prog to be interpreted */
- prog->jit_requested = 0;
- prog->blinding_requested = 0;
- for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
- if (!bpf_pseudo_call(insn))
- continue;
- insn->off = 0;
- insn->imm = env->insn_aux_data[i].call_imm;
- }
- bpf_prog_jit_attempt_done(prog);
- return err;
-}
-
-static int fixup_call_args(struct bpf_verifier_env *env)
-{
-#ifndef CONFIG_BPF_JIT_ALWAYS_ON
- struct bpf_prog *prog = env->prog;
- struct bpf_insn *insn = prog->insnsi;
- bool has_kfunc_call = bpf_prog_has_kfunc_call(prog);
- int i, depth;
-#endif
- int err = 0;
-
- if (env->prog->jit_requested &&
- !bpf_prog_is_offloaded(env->prog->aux)) {
- err = jit_subprogs(env);
- if (err == 0)
- return 0;
- if (err == -EFAULT)
- return err;
- }
-#ifndef CONFIG_BPF_JIT_ALWAYS_ON
- if (has_kfunc_call) {
- verbose(env, "calling kernel functions are not allowed in non-JITed programs\n");
- return -EINVAL;
- }
- if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) {
- /* When JIT fails the progs with bpf2bpf calls and tail_calls
- * have to be rejected, since interpreter doesn't support them yet.
- */
- verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n");
- return -EINVAL;
- }
- for (i = 0; i < prog->len; i++, insn++) {
- if (bpf_pseudo_func(insn)) {
- /* When JIT fails the progs with callback calls
- * have to be rejected, since interpreter doesn't support them yet.
- */
- verbose(env, "callbacks are not allowed in non-JITed programs\n");
- return -EINVAL;
- }
-
- if (!bpf_pseudo_call(insn))
- continue;
- depth = get_callee_stack_depth(env, insn, i);
- if (depth < 0)
- return depth;
- bpf_patch_call_args(insn, depth);
- }
- err = 0;
-#endif
- return err;
-}
-
-/* replace a generic kfunc with a specialized version if necessary */
-static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, int insn_idx)
-{
- struct bpf_prog *prog = env->prog;
- bool seen_direct_write;
- void *xdp_kfunc;
- bool is_rdonly;
- u32 func_id = desc->func_id;
- u16 offset = desc->offset;
- unsigned long addr = desc->addr;
-
- if (offset) /* return if module BTF is used */
- return 0;
-
- if (bpf_dev_bound_kfunc_id(func_id)) {
- xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id);
- if (xdp_kfunc)
- addr = (unsigned long)xdp_kfunc;
- /* fallback to default kfunc when not supported by netdev */
- } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
- seen_direct_write = env->seen_direct_write;
- is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE);
-
- if (is_rdonly)
- addr = (unsigned long)bpf_dynptr_from_skb_rdonly;
-
- /* restore env->seen_direct_write to its original value, since
- * may_access_direct_pkt_data mutates it
- */
- env->seen_direct_write = seen_direct_write;
- } else if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr]) {
- if (bpf_lsm_has_d_inode_locked(prog))
- addr = (unsigned long)bpf_set_dentry_xattr_locked;
- } else if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr]) {
- if (bpf_lsm_has_d_inode_locked(prog))
- addr = (unsigned long)bpf_remove_dentry_xattr_locked;
- } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) {
- if (!env->insn_aux_data[insn_idx].non_sleepable)
- addr = (unsigned long)bpf_dynptr_from_file_sleepable;
- } else if (func_id == special_kfunc_list[KF_bpf_arena_alloc_pages]) {
- if (env->insn_aux_data[insn_idx].non_sleepable)
- addr = (unsigned long)bpf_arena_alloc_pages_non_sleepable;
- } else if (func_id == special_kfunc_list[KF_bpf_arena_free_pages]) {
- if (env->insn_aux_data[insn_idx].non_sleepable)
- addr = (unsigned long)bpf_arena_free_pages_non_sleepable;
- }
- desc->addr = addr;
- return 0;
-}
-
-static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
- u16 struct_meta_reg,
- u16 node_offset_reg,
- struct bpf_insn *insn,
- struct bpf_insn *insn_buf,
- int *cnt)
-{
- struct btf_struct_meta *kptr_struct_meta = insn_aux->kptr_struct_meta;
- struct bpf_insn addr[2] = { BPF_LD_IMM64(struct_meta_reg, (long)kptr_struct_meta) };
-
- insn_buf[0] = addr[0];
- insn_buf[1] = addr[1];
- insn_buf[2] = BPF_MOV64_IMM(node_offset_reg, insn_aux->insert_off);
- insn_buf[3] = *insn;
- *cnt = 4;
-}
-
-static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
- struct bpf_insn *insn_buf, int insn_idx, int *cnt)
-{
- struct bpf_kfunc_desc *desc;
- int err;
-
- if (!insn->imm) {
- verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
- return -EINVAL;
- }
-
- *cnt = 0;
-
- /* insn->imm has the btf func_id. Replace it with an offset relative to
- * __bpf_call_base, unless the JIT needs to call functions that are
- * further than 32 bits away (bpf_jit_supports_far_kfunc_call()).
- */
- desc = find_kfunc_desc(env->prog, insn->imm, insn->off);
- if (!desc) {
- verifier_bug(env, "kernel function descriptor not found for func_id %u",
- insn->imm);
- return -EFAULT;
- }
-
- err = specialize_kfunc(env, desc, insn_idx);
- if (err)
- return err;
-
- if (!bpf_jit_supports_far_kfunc_call())
- insn->imm = BPF_CALL_IMM(desc->addr);
-
- if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
- desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
- struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
- struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
- u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size;
-
- if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && kptr_struct_meta) {
- verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d",
- insn_idx);
- return -EFAULT;
- }
-
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size);
- insn_buf[1] = addr[0];
- insn_buf[2] = addr[1];
- insn_buf[3] = *insn;
- *cnt = 4;
- } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
- desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] ||
- desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
- struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
- struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
-
- if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] && kptr_struct_meta) {
- verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d",
- insn_idx);
- return -EFAULT;
- }
-
- if (desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
- !kptr_struct_meta) {
- verifier_bug(env, "kptr_struct_meta expected at insn_idx %d",
- insn_idx);
- return -EFAULT;
- }
-
- insn_buf[0] = addr[0];
- insn_buf[1] = addr[1];
- insn_buf[2] = *insn;
- *cnt = 3;
- } else if (desc->func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
- desc->func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
- desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
- struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
- int struct_meta_reg = BPF_REG_3;
- int node_offset_reg = BPF_REG_4;
-
- /* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */
- if (desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
- struct_meta_reg = BPF_REG_4;
- node_offset_reg = BPF_REG_5;
- }
-
- if (!kptr_struct_meta) {
- verifier_bug(env, "kptr_struct_meta expected at insn_idx %d",
- insn_idx);
- return -EFAULT;
- }
-
- __fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg,
- node_offset_reg, insn, insn_buf, cnt);
- } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
- desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
- insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
- *cnt = 1;
- } else if (desc->func_id == special_kfunc_list[KF_bpf_session_is_return] &&
- env->prog->expected_attach_type == BPF_TRACE_FSESSION) {
- /*
- * inline the bpf_session_is_return() for fsession:
- * bool bpf_session_is_return(void *ctx)
- * {
- * return (((u64 *)ctx)[-1] >> BPF_TRAMP_IS_RETURN_SHIFT) & 1;
- * }
- */
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
- insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_IS_RETURN_SHIFT);
- insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1);
- *cnt = 3;
- } else if (desc->func_id == special_kfunc_list[KF_bpf_session_cookie] &&
- env->prog->expected_attach_type == BPF_TRACE_FSESSION) {
- /*
- * inline bpf_session_cookie() for fsession:
- * __u64 *bpf_session_cookie(void *ctx)
- * {
- * u64 off = (((u64 *)ctx)[-1] >> BPF_TRAMP_COOKIE_INDEX_SHIFT) & 0xFF;
- * return &((u64 *)ctx)[-off];
- * }
- */
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
- insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_COOKIE_INDEX_SHIFT);
- insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
- insn_buf[3] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
- insn_buf[4] = BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1);
- insn_buf[5] = BPF_ALU64_IMM(BPF_NEG, BPF_REG_0, 0);
- *cnt = 6;
- }
-
- if (env->insn_aux_data[insn_idx].arg_prog) {
- u32 regno = env->insn_aux_data[insn_idx].arg_prog;
- struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(regno, (long)env->prog->aux) };
- int idx = *cnt;
-
- insn_buf[idx++] = ld_addrs[0];
- insn_buf[idx++] = ld_addrs[1];
- insn_buf[idx++] = *insn;
- *cnt = idx;
- }
- return 0;
-}
-
-/* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */
-static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
-{
- struct bpf_subprog_info *info = env->subprog_info;
- int cnt = env->subprog_cnt;
- struct bpf_prog *prog;
-
- /* We only reserve one slot for hidden subprogs in subprog_info. */
- if (env->hidden_subprog_cnt) {
- verifier_bug(env, "only one hidden subprog supported");
- return -EFAULT;
- }
- /* We're not patching any existing instruction, just appending the new
- * ones for the hidden subprog. Hence all of the adjustment operations
- * in bpf_patch_insn_data are no-ops.
- */
- prog = bpf_patch_insn_data(env, env->prog->len - 1, patch, len);
- if (!prog)
- return -ENOMEM;
- env->prog = prog;
- info[cnt + 1].start = info[cnt].start;
- info[cnt].start = prog->len - len + 1;
- env->subprog_cnt++;
- env->hidden_subprog_cnt++;
- return 0;
-}
-
-/* Do various post-verification rewrites in a single program pass.
- * These rewrites simplify JIT and interpreter implementations.
- */
-static int do_misc_fixups(struct bpf_verifier_env *env)
-{
- struct bpf_prog *prog = env->prog;
- enum bpf_attach_type eatype = prog->expected_attach_type;
- enum bpf_prog_type prog_type = resolve_prog_type(prog);
- struct bpf_insn *insn = prog->insnsi;
- const struct bpf_func_proto *fn;
- const int insn_cnt = prog->len;
- const struct bpf_map_ops *ops;
- struct bpf_insn_aux_data *aux;
- struct bpf_insn *insn_buf = env->insn_buf;
- struct bpf_prog *new_prog;
- struct bpf_map *map_ptr;
- int i, ret, cnt, delta = 0, cur_subprog = 0;
- struct bpf_subprog_info *subprogs = env->subprog_info;
- u16 stack_depth = subprogs[cur_subprog].stack_depth;
- u16 stack_depth_extra = 0;
-
- if (env->seen_exception && !env->exception_callback_subprog) {
- struct bpf_insn *patch = insn_buf;
-
- *patch++ = env->prog->insnsi[insn_cnt - 1];
- *patch++ = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
- *patch++ = BPF_EXIT_INSN();
- ret = add_hidden_subprog(env, insn_buf, patch - insn_buf);
- if (ret < 0)
- return ret;
- prog = env->prog;
- insn = prog->insnsi;
-
- env->exception_callback_subprog = env->subprog_cnt - 1;
- /* Don't update insn_cnt, as add_hidden_subprog always appends insns */
- mark_subprog_exc_cb(env, env->exception_callback_subprog);
- }
-
- for (i = 0; i < insn_cnt;) {
- if (insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->imm) {
- if ((insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1) ||
- (((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) {
- /* convert to 32-bit mov that clears upper 32-bit */
- insn->code = BPF_ALU | BPF_MOV | BPF_X;
- /* clear off and imm, so it's a normal 'wX = wY' from JIT pov */
- insn->off = 0;
- insn->imm = 0;
- } /* cast from as(0) to as(1) should be handled by JIT */
- goto next_insn;
- }
-
- if (env->insn_aux_data[i + delta].needs_zext)
- /* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */
- insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code);
-
- /* Make sdiv/smod divide-by-minus-one exceptions impossible. */
- if ((insn->code == (BPF_ALU64 | BPF_MOD | BPF_K) ||
- insn->code == (BPF_ALU64 | BPF_DIV | BPF_K) ||
- insn->code == (BPF_ALU | BPF_MOD | BPF_K) ||
- insn->code == (BPF_ALU | BPF_DIV | BPF_K)) &&
- insn->off == 1 && insn->imm == -1) {
- bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
- bool isdiv = BPF_OP(insn->code) == BPF_DIV;
- struct bpf_insn *patch = insn_buf;
-
- if (isdiv)
- *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
- BPF_NEG | BPF_K, insn->dst_reg,
- 0, 0, 0);
- else
- *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0);
-
- cnt = patch - insn_buf;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- /* Make divide-by-zero and divide-by-minus-one exceptions impossible. */
- if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
- insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
- insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
- insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
- bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
- bool isdiv = BPF_OP(insn->code) == BPF_DIV;
- bool is_sdiv = isdiv && insn->off == 1;
- bool is_smod = !isdiv && insn->off == 1;
- struct bpf_insn *patch = insn_buf;
-
- if (is_sdiv) {
- /* [R,W]x sdiv 0 -> 0
- * LLONG_MIN sdiv -1 -> LLONG_MIN
- * INT_MIN sdiv -1 -> INT_MIN
- */
- *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
- *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
- BPF_ADD | BPF_K, BPF_REG_AX,
- 0, 0, 1);
- *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
- BPF_JGT | BPF_K, BPF_REG_AX,
- 0, 4, 1);
- *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
- BPF_JEQ | BPF_K, BPF_REG_AX,
- 0, 1, 0);
- *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
- BPF_MOV | BPF_K, insn->dst_reg,
- 0, 0, 0);
- /* BPF_NEG(LLONG_MIN) == -LLONG_MIN == LLONG_MIN */
- *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
- BPF_NEG | BPF_K, insn->dst_reg,
- 0, 0, 0);
- *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
- *patch++ = *insn;
- cnt = patch - insn_buf;
- } else if (is_smod) {
- /* [R,W]x mod 0 -> [R,W]x */
- /* [R,W]x mod -1 -> 0 */
- *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
- *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
- BPF_ADD | BPF_K, BPF_REG_AX,
- 0, 0, 1);
- *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
- BPF_JGT | BPF_K, BPF_REG_AX,
- 0, 3, 1);
- *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
- BPF_JEQ | BPF_K, BPF_REG_AX,
- 0, 3 + (is64 ? 0 : 1), 1);
- *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0);
- *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
- *patch++ = *insn;
-
- if (!is64) {
- *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
- *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg);
- }
- cnt = patch - insn_buf;
- } else if (isdiv) {
- /* [R,W]x div 0 -> 0 */
- *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
- BPF_JNE | BPF_K, insn->src_reg,
- 0, 2, 0);
- *patch++ = BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg);
- *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
- *patch++ = *insn;
- cnt = patch - insn_buf;
- } else {
- /* [R,W]x mod 0 -> [R,W]x */
- *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
- BPF_JEQ | BPF_K, insn->src_reg,
- 0, 1 + (is64 ? 0 : 1), 0);
- *patch++ = *insn;
-
- if (!is64) {
- *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
- *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg);
- }
- cnt = patch - insn_buf;
- }
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- /* Make it impossible to de-reference a userspace address */
- if (BPF_CLASS(insn->code) == BPF_LDX &&
- (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
- BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
- struct bpf_insn *patch = insn_buf;
- u64 uaddress_limit = bpf_arch_uaddress_limit();
-
- if (!uaddress_limit)
- goto next_insn;
-
- *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
- if (insn->off)
- *patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off);
- *patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32);
- *patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2);
- *patch++ = *insn;
- *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
- *patch++ = BPF_MOV64_IMM(insn->dst_reg, 0);
-
- cnt = patch - insn_buf;
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
- if (BPF_CLASS(insn->code) == BPF_LD &&
- (BPF_MODE(insn->code) == BPF_ABS ||
- BPF_MODE(insn->code) == BPF_IND)) {
- cnt = env->ops->gen_ld_abs(insn, insn_buf);
- if (cnt == 0 || cnt >= INSN_BUF_SIZE) {
- verifier_bug(env, "%d insns generated for ld_abs", cnt);
- return -EFAULT;
- }
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- /* Rewrite pointer arithmetic to mitigate speculation attacks. */
- if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) ||
- insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {
- const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
- const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;
- struct bpf_insn *patch = insn_buf;
- bool issrc, isneg, isimm;
- u32 off_reg;
-
- aux = &env->insn_aux_data[i + delta];
- if (!aux->alu_state ||
- aux->alu_state == BPF_ALU_NON_POINTER)
- goto next_insn;
-
- isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
- issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
- BPF_ALU_SANITIZE_SRC;
- isimm = aux->alu_state & BPF_ALU_IMMEDIATE;
-
- off_reg = issrc ? insn->src_reg : insn->dst_reg;
- if (isimm) {
- *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
- } else {
- if (isneg)
- *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
- *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
- *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
- *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
- *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
- *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
- *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg);
- }
- if (!issrc)
- *patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg);
- insn->src_reg = BPF_REG_AX;
- if (isneg)
- insn->code = insn->code == code_add ?
- code_sub : code_add;
- *patch++ = *insn;
- if (issrc && isneg && !isimm)
- *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
- cnt = patch - insn_buf;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- if (is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) {
- int stack_off_cnt = -stack_depth - 16;
-
- /*
- * Two 8 byte slots, depth-16 stores the count, and
- * depth-8 stores the start timestamp of the loop.
- *
- * The starting value of count is BPF_MAX_TIMED_LOOPS
- * (0xffff). Every iteration loads it and subs it by 1,
- * until the value becomes 0 in AX (thus, 1 in stack),
- * after which we call arch_bpf_timed_may_goto, which
- * either sets AX to 0xffff to keep looping, or to 0
- * upon timeout. AX is then stored into the stack. In
- * the next iteration, we either see 0 and break out, or
- * continue iterating until the next time value is 0
- * after subtraction, rinse and repeat.
- */
- stack_depth_extra = 16;
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt);
- if (insn->off >= 0)
- insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5);
- else
- insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
- insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
- insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2);
- /*
- * AX is used as an argument to pass in stack_off_cnt
- * (to add to r10/fp), and also as the return value of
- * the call to arch_bpf_timed_may_goto.
- */
- insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt);
- insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto);
- insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt);
- cnt = 7;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- } else if (is_may_goto_insn(insn)) {
- int stack_off = -stack_depth - 8;
-
- stack_depth_extra = 8;
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off);
- if (insn->off >= 0)
- insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2);
- else
- insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
- insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
- insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off);
- cnt = 4;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- if (insn->code != (BPF_JMP | BPF_CALL))
- goto next_insn;
- if (insn->src_reg == BPF_PSEUDO_CALL)
- goto next_insn;
- if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
- ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
- if (ret)
- return ret;
- if (cnt == 0)
- goto next_insn;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- /* Skip inlining the helper call if the JIT does it. */
- if (bpf_jit_inlines_helper_call(insn->imm))
- goto next_insn;
-
- if (insn->imm == BPF_FUNC_get_route_realm)
- prog->dst_needed = 1;
- if (insn->imm == BPF_FUNC_get_prandom_u32)
- bpf_user_rnd_init_once();
- if (insn->imm == BPF_FUNC_override_return)
- prog->kprobe_override = 1;
- if (insn->imm == BPF_FUNC_tail_call) {
- /* If we tail call into other programs, we
- * cannot make any assumptions since they can
- * be replaced dynamically during runtime in
- * the program array.
- */
- prog->cb_access = 1;
- if (!allow_tail_call_in_subprogs(env))
- prog->aux->stack_depth = MAX_BPF_STACK;
- prog->aux->max_pkt_offset = MAX_PACKET_OFF;
-
- /* mark bpf_tail_call as different opcode to avoid
- * conditional branch in the interpreter for every normal
- * call and to prevent accidental JITing by JIT compiler
- * that doesn't support bpf_tail_call yet
- */
- insn->imm = 0;
- insn->code = BPF_JMP | BPF_TAIL_CALL;
-
- aux = &env->insn_aux_data[i + delta];
- if (env->bpf_capable && !prog->blinding_requested &&
- prog->jit_requested &&
- !bpf_map_key_poisoned(aux) &&
- !bpf_map_ptr_poisoned(aux) &&
- !bpf_map_ptr_unpriv(aux)) {
- struct bpf_jit_poke_descriptor desc = {
- .reason = BPF_POKE_REASON_TAIL_CALL,
- .tail_call.map = aux->map_ptr_state.map_ptr,
- .tail_call.key = bpf_map_key_immediate(aux),
- .insn_idx = i + delta,
- };
-
- ret = bpf_jit_add_poke_descriptor(prog, &desc);
- if (ret < 0) {
- verbose(env, "adding tail call poke descriptor failed\n");
- return ret;
- }
-
- insn->imm = ret + 1;
- goto next_insn;
- }
-
- if (!bpf_map_ptr_unpriv(aux))
- goto next_insn;
-
- /* instead of changing every JIT dealing with tail_call
- * emit two extra insns:
- * if (index >= max_entries) goto out;
- * index &= array->index_mask;
- * to avoid out-of-bounds cpu speculation
- */
- if (bpf_map_ptr_poisoned(aux)) {
- verbose(env, "tail_call abusing map_ptr\n");
- return -EINVAL;
- }
-
- map_ptr = aux->map_ptr_state.map_ptr;
- insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
- map_ptr->max_entries, 2);
- insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
- container_of(map_ptr,
- struct bpf_array,
- map)->index_mask);
- insn_buf[2] = *insn;
- cnt = 3;
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- if (insn->imm == BPF_FUNC_timer_set_callback) {
- /* The verifier will process callback_fn as many times as necessary
- * with different maps and the register states prepared by
- * set_timer_callback_state will be accurate.
- *
- * The following use case is valid:
- * map1 is shared by prog1, prog2, prog3.
- * prog1 calls bpf_timer_init for some map1 elements
- * prog2 calls bpf_timer_set_callback for some map1 elements.
- * Those that were not bpf_timer_init-ed will return -EINVAL.
- * prog3 calls bpf_timer_start for some map1 elements.
- * Those that were not both bpf_timer_init-ed and
- * bpf_timer_set_callback-ed will return -EINVAL.
- */
- struct bpf_insn ld_addrs[2] = {
- BPF_LD_IMM64(BPF_REG_3, (long)prog->aux),
- };
-
- insn_buf[0] = ld_addrs[0];
- insn_buf[1] = ld_addrs[1];
- insn_buf[2] = *insn;
- cnt = 3;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto patch_call_imm;
- }
-
- if (is_storage_get_function(insn->imm)) {
- if (env->insn_aux_data[i + delta].non_sleepable)
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
- else
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
- insn_buf[1] = *insn;
- cnt = 2;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto patch_call_imm;
- }
-
- /* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */
- if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) {
- /* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data,
- * bpf_mem_alloc() returns a ptr to the percpu data ptr.
- */
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0);
- insn_buf[1] = *insn;
- cnt = 2;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto patch_call_imm;
- }
-
- /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
- * and other inlining handlers are currently limited to 64 bit
- * only.
- */
- if (prog->jit_requested && BITS_PER_LONG == 64 &&
- (insn->imm == BPF_FUNC_map_lookup_elem ||
- insn->imm == BPF_FUNC_map_update_elem ||
- insn->imm == BPF_FUNC_map_delete_elem ||
- insn->imm == BPF_FUNC_map_push_elem ||
- insn->imm == BPF_FUNC_map_pop_elem ||
- insn->imm == BPF_FUNC_map_peek_elem ||
- insn->imm == BPF_FUNC_redirect_map ||
- insn->imm == BPF_FUNC_for_each_map_elem ||
- insn->imm == BPF_FUNC_map_lookup_percpu_elem)) {
- aux = &env->insn_aux_data[i + delta];
- if (bpf_map_ptr_poisoned(aux))
- goto patch_call_imm;
-
- map_ptr = aux->map_ptr_state.map_ptr;
- ops = map_ptr->ops;
- if (insn->imm == BPF_FUNC_map_lookup_elem &&
- ops->map_gen_lookup) {
- cnt = ops->map_gen_lookup(map_ptr, insn_buf);
- if (cnt == -EOPNOTSUPP)
- goto patch_map_ops_generic;
- if (cnt <= 0 || cnt >= INSN_BUF_SIZE) {
- verifier_bug(env, "%d insns generated for map lookup", cnt);
- return -EFAULT;
- }
-
- new_prog = bpf_patch_insn_data(env, i + delta,
- insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
- (void *(*)(struct bpf_map *map, void *key))NULL));
- BUILD_BUG_ON(!__same_type(ops->map_delete_elem,
- (long (*)(struct bpf_map *map, void *key))NULL));
- BUILD_BUG_ON(!__same_type(ops->map_update_elem,
- (long (*)(struct bpf_map *map, void *key, void *value,
- u64 flags))NULL));
- BUILD_BUG_ON(!__same_type(ops->map_push_elem,
- (long (*)(struct bpf_map *map, void *value,
- u64 flags))NULL));
- BUILD_BUG_ON(!__same_type(ops->map_pop_elem,
- (long (*)(struct bpf_map *map, void *value))NULL));
- BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
- (long (*)(struct bpf_map *map, void *value))NULL));
- BUILD_BUG_ON(!__same_type(ops->map_redirect,
- (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
- BUILD_BUG_ON(!__same_type(ops->map_for_each_callback,
- (long (*)(struct bpf_map *map,
- bpf_callback_t callback_fn,
- void *callback_ctx,
- u64 flags))NULL));
- BUILD_BUG_ON(!__same_type(ops->map_lookup_percpu_elem,
- (void *(*)(struct bpf_map *map, void *key, u32 cpu))NULL));
-
-patch_map_ops_generic:
- switch (insn->imm) {
- case BPF_FUNC_map_lookup_elem:
- insn->imm = BPF_CALL_IMM(ops->map_lookup_elem);
- goto next_insn;
- case BPF_FUNC_map_update_elem:
- insn->imm = BPF_CALL_IMM(ops->map_update_elem);
- goto next_insn;
- case BPF_FUNC_map_delete_elem:
- insn->imm = BPF_CALL_IMM(ops->map_delete_elem);
- goto next_insn;
- case BPF_FUNC_map_push_elem:
- insn->imm = BPF_CALL_IMM(ops->map_push_elem);
- goto next_insn;
- case BPF_FUNC_map_pop_elem:
- insn->imm = BPF_CALL_IMM(ops->map_pop_elem);
- goto next_insn;
- case BPF_FUNC_map_peek_elem:
- insn->imm = BPF_CALL_IMM(ops->map_peek_elem);
- goto next_insn;
- case BPF_FUNC_redirect_map:
- insn->imm = BPF_CALL_IMM(ops->map_redirect);
- goto next_insn;
- case BPF_FUNC_for_each_map_elem:
- insn->imm = BPF_CALL_IMM(ops->map_for_each_callback);
- goto next_insn;
- case BPF_FUNC_map_lookup_percpu_elem:
- insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem);
- goto next_insn;
- }
-
- goto patch_call_imm;
- }
-
- /* Implement bpf_jiffies64 inline. */
- if (prog->jit_requested && BITS_PER_LONG == 64 &&
- insn->imm == BPF_FUNC_jiffies64) {
- struct bpf_insn ld_jiffies_addr[2] = {
- BPF_LD_IMM64(BPF_REG_0,
- (unsigned long)&jiffies),
- };
-
- insn_buf[0] = ld_jiffies_addr[0];
- insn_buf[1] = ld_jiffies_addr[1];
- insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0,
- BPF_REG_0, 0);
- cnt = 3;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
- cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
-#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
- /* Implement bpf_get_smp_processor_id() inline. */
- if (insn->imm == BPF_FUNC_get_smp_processor_id &&
- verifier_inlines_helper_call(env, insn->imm)) {
- /* BPF_FUNC_get_smp_processor_id inlining is an
- * optimization, so if cpu_number is ever
- * changed in some incompatible and hard to support
- * way, it's fine to back out this inlining logic
- */
-#ifdef CONFIG_SMP
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&cpu_number);
- insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
- insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0);
- cnt = 3;
-#else
- insn_buf[0] = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
- cnt = 1;
-#endif
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- /* Implement bpf_get_current_task() and bpf_get_current_task_btf() inline. */
- if ((insn->imm == BPF_FUNC_get_current_task || insn->imm == BPF_FUNC_get_current_task_btf) &&
- verifier_inlines_helper_call(env, insn->imm)) {
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&current_task);
- insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
- insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
- cnt = 3;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-#endif
- /* Implement bpf_get_func_arg inline. */
- if (prog_type == BPF_PROG_TYPE_TRACING &&
- insn->imm == BPF_FUNC_get_func_arg) {
- if (eatype == BPF_TRACE_RAW_TP) {
- int nr_args = btf_type_vlen(prog->aux->attach_func_proto);
-
- /* skip 'void *__data' in btf_trace_##name() and save to reg0 */
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1);
- cnt = 1;
- } else {
- /* Load nr_args from ctx - 8 */
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
- insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
- cnt = 2;
- }
- insn_buf[cnt++] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6);
- insn_buf[cnt++] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3);
- insn_buf[cnt++] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1);
- insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0);
- insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
- insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, 0);
- insn_buf[cnt++] = BPF_JMP_A(1);
- insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- /* Implement bpf_get_func_ret inline. */
- if (prog_type == BPF_PROG_TYPE_TRACING &&
- insn->imm == BPF_FUNC_get_func_ret) {
- if (eatype == BPF_TRACE_FEXIT ||
- eatype == BPF_TRACE_FSESSION ||
- eatype == BPF_MODIFY_RETURN) {
- /* Load nr_args from ctx - 8 */
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
- insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
- insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
- insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
- insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
- insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0);
- insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0);
- cnt = 7;
- } else {
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP);
- cnt = 1;
- }
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- /* Implement get_func_arg_cnt inline. */
- if (prog_type == BPF_PROG_TYPE_TRACING &&
- insn->imm == BPF_FUNC_get_func_arg_cnt) {
- if (eatype == BPF_TRACE_RAW_TP) {
- int nr_args = btf_type_vlen(prog->aux->attach_func_proto);
-
- /* skip 'void *__data' in btf_trace_##name() and save to reg0 */
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1);
- cnt = 1;
- } else {
- /* Load nr_args from ctx - 8 */
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
- insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
- cnt = 2;
- }
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- /* Implement bpf_get_func_ip inline. */
- if (prog_type == BPF_PROG_TYPE_TRACING &&
- insn->imm == BPF_FUNC_get_func_ip) {
- /* Load IP address from ctx - 16 */
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -16);
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
- if (!new_prog)
- return -ENOMEM;
-
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- /* Implement bpf_get_branch_snapshot inline. */
- if (IS_ENABLED(CONFIG_PERF_EVENTS) &&
- prog->jit_requested && BITS_PER_LONG == 64 &&
- insn->imm == BPF_FUNC_get_branch_snapshot) {
- /* We are dealing with the following func protos:
- * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags);
- * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt);
- */
- const u32 br_entry_size = sizeof(struct perf_branch_entry);
-
- /* struct perf_branch_entry is part of UAPI and is
- * used as an array element, so extremely unlikely to
- * ever grow or shrink
- */
- BUILD_BUG_ON(br_entry_size != 24);
-
- /* if (unlikely(flags)) return -EINVAL */
- insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7);
-
- /* Transform size (bytes) into number of entries (cnt = size / 24).
- * But to avoid expensive division instruction, we implement
- * divide-by-3 through multiplication, followed by further
- * division by 8 through 3-bit right shift.
- * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr.,
- * p. 227, chapter "Unsigned Division by 3" for details and proofs.
- *
- * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab.
- */
- insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab);
- insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0);
- insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36);
-
- /* call perf_snapshot_branch_stack implementation */
- insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack));
- /* if (entry_cnt == 0) return -ENOENT */
- insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4);
- /* return entry_cnt * sizeof(struct perf_branch_entry) */
- insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size);
- insn_buf[7] = BPF_JMP_A(3);
- /* return -EINVAL; */
- insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
- insn_buf[9] = BPF_JMP_A(1);
- /* return -ENOENT; */
- insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT);
- cnt = 11;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- /* Implement bpf_kptr_xchg inline */
- if (prog->jit_requested && BITS_PER_LONG == 64 &&
- insn->imm == BPF_FUNC_kptr_xchg &&
- bpf_jit_supports_ptr_xchg()) {
- insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_2);
- insn_buf[1] = BPF_ATOMIC_OP(BPF_DW, BPF_XCHG, BPF_REG_1, BPF_REG_0, 0);
- cnt = 2;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-patch_call_imm:
- fn = env->ops->get_func_proto(insn->imm, env->prog);
- /* all functions that have prototype and verifier allowed
- * programs to call them, must be real in-kernel functions
- */
- if (!fn->func) {
- verifier_bug(env,
- "not inlined functions %s#%d is missing func",
- func_id_name(insn->imm), insn->imm);
- return -EFAULT;
- }
- insn->imm = fn->func - __bpf_call_base;
-next_insn:
- if (subprogs[cur_subprog + 1].start == i + delta + 1) {
- subprogs[cur_subprog].stack_depth += stack_depth_extra;
- subprogs[cur_subprog].stack_extra = stack_depth_extra;
-
- stack_depth = subprogs[cur_subprog].stack_depth;
- if (stack_depth > MAX_BPF_STACK && !prog->jit_requested) {
- verbose(env, "stack size %d(extra %d) is too large\n",
- stack_depth, stack_depth_extra);
- return -EINVAL;
- }
- cur_subprog++;
- stack_depth = subprogs[cur_subprog].stack_depth;
- stack_depth_extra = 0;
- }
- i++;
- insn++;
- }
-
- env->prog->aux->stack_depth = subprogs[0].stack_depth;
- for (i = 0; i < env->subprog_cnt; i++) {
- int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1;
- int subprog_start = subprogs[i].start;
- int stack_slots = subprogs[i].stack_extra / 8;
- int slots = delta, cnt = 0;
-
- if (!stack_slots)
- continue;
- /* We need two slots in case timed may_goto is supported. */
- if (stack_slots > slots) {
- verifier_bug(env, "stack_slots supports may_goto only");
- return -EFAULT;
- }
-
- stack_depth = subprogs[i].stack_depth;
- if (bpf_jit_supports_timed_may_goto()) {
- insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
- BPF_MAX_TIMED_LOOPS);
- insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth + 8, 0);
- } else {
- /* Add ST insn to subprog prologue to init extra stack */
- insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
- BPF_MAX_LOOPS);
- }
- /* Copy first actual insn to preserve it */
- insn_buf[cnt++] = env->prog->insnsi[subprog_start];
-
- new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
- env->prog = prog = new_prog;
- /*
- * If may_goto is a first insn of a prog there could be a jmp
- * insn that points to it, hence adjust all such jmps to point
- * to insn after BPF_ST that inits may_goto count.
- * Adjustment will succeed because bpf_patch_insn_data() didn't fail.
- */
- WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta));
- }
-
- /* Since poke tab is now finalized, publish aux to tracker. */
- for (i = 0; i < prog->aux->size_poke_tab; i++) {
- map_ptr = prog->aux->poke_tab[i].tail_call.map;
- if (!map_ptr->ops->map_poke_track ||
- !map_ptr->ops->map_poke_untrack ||
- !map_ptr->ops->map_poke_run) {
- verifier_bug(env, "poke tab is misconfigured");
- return -EFAULT;
- }
-
- ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux);
- if (ret < 0) {
- verbose(env, "tracking tail call prog failed\n");
- return ret;
- }
- }
-
- ret = sort_kfunc_descs_by_imm_off(env);
- if (ret)
- return ret;
-
- return 0;
-}
-
-static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
- int position,
- s32 stack_base,
- u32 callback_subprogno,
- u32 *total_cnt)
-{
- s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
- s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
- s32 r8_offset = stack_base + 2 * BPF_REG_SIZE;
- int reg_loop_max = BPF_REG_6;
- int reg_loop_cnt = BPF_REG_7;
- int reg_loop_ctx = BPF_REG_8;
-
- struct bpf_insn *insn_buf = env->insn_buf;
- struct bpf_prog *new_prog;
- u32 callback_start;
- u32 call_insn_offset;
- s32 callback_offset;
- u32 cnt = 0;
-
- /* This represents an inlined version of bpf_iter.c:bpf_loop,
- * be careful to modify this code in sync.
- */
- /* Return error and jump to the end of the patch if
- * expected number of iterations is too big.
- */
- insn_buf[cnt++] = BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2);
- insn_buf[cnt++] = BPF_MOV32_IMM(BPF_REG_0, -E2BIG);
- insn_buf[cnt++] = BPF_JMP_IMM(BPF_JA, 0, 0, 16);
- /* spill R6, R7, R8 to use these as loop vars */
- insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset);
- insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset);
- insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset);
- /* initialize loop vars */
- insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_max, BPF_REG_1);
- insn_buf[cnt++] = BPF_MOV32_IMM(reg_loop_cnt, 0);
- insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3);
- /* loop header,
- * if reg_loop_cnt >= reg_loop_max skip the loop body
- */
- insn_buf[cnt++] = BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5);
- /* callback call,
- * correct callback offset would be set after patching
- */
- insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt);
- insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx);
- insn_buf[cnt++] = BPF_CALL_REL(0);
- /* increment loop counter */
- insn_buf[cnt++] = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1);
- /* jump to loop header if callback returned 0 */
- insn_buf[cnt++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6);
- /* return value of bpf_loop,
- * set R0 to the number of iterations
- */
- insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt);
- /* restore original values of R6, R7, R8 */
- insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset);
- insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset);
- insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset);
-
- *total_cnt = cnt;
- new_prog = bpf_patch_insn_data(env, position, insn_buf, cnt);
- if (!new_prog)
- return new_prog;
-
- /* callback start is known only after patching */
- callback_start = env->subprog_info[callback_subprogno].start;
- /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */
- call_insn_offset = position + 12;
- callback_offset = callback_start - call_insn_offset - 1;
- new_prog->insnsi[call_insn_offset].imm = callback_offset;
-
- return new_prog;
-}
-
-static bool is_bpf_loop_call(struct bpf_insn *insn)
-{
- return insn->code == (BPF_JMP | BPF_CALL) &&
- insn->src_reg == 0 &&
- insn->imm == BPF_FUNC_loop;
-}
-
-/* For all sub-programs in the program (including main) check
- * insn_aux_data to see if there are bpf_loop calls that require
- * inlining. If such calls are found the calls are replaced with a
- * sequence of instructions produced by `inline_bpf_loop` function and
- * subprog stack_depth is increased by the size of 3 registers.
- * This stack space is used to spill values of the R6, R7, R8. These
- * registers are used to store the loop bound, counter and context
- * variables.
- */
-static int optimize_bpf_loop(struct bpf_verifier_env *env)
-{
- struct bpf_subprog_info *subprogs = env->subprog_info;
- int i, cur_subprog = 0, cnt, delta = 0;
- struct bpf_insn *insn = env->prog->insnsi;
- int insn_cnt = env->prog->len;
- u16 stack_depth = subprogs[cur_subprog].stack_depth;
- u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
- u16 stack_depth_extra = 0;
-
- for (i = 0; i < insn_cnt; i++, insn++) {
- struct bpf_loop_inline_state *inline_state =
- &env->insn_aux_data[i + delta].loop_inline_state;
-
- if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) {
- struct bpf_prog *new_prog;
-
- stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup;
- new_prog = inline_bpf_loop(env,
- i + delta,
- -(stack_depth + stack_depth_extra),
- inline_state->callback_subprogno,
- &cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- }
-
- if (subprogs[cur_subprog + 1].start == i + delta + 1) {
- subprogs[cur_subprog].stack_depth += stack_depth_extra;
- cur_subprog++;
- stack_depth = subprogs[cur_subprog].stack_depth;
- stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
- stack_depth_extra = 0;
- }
- }
-
- env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
-
- return 0;
-}
-
-/* Remove unnecessary spill/fill pairs, members of fastcall pattern,
- * adjust subprograms stack depth when possible.
- */
-static int remove_fastcall_spills_fills(struct bpf_verifier_env *env)
-{
- struct bpf_subprog_info *subprog = env->subprog_info;
- struct bpf_insn_aux_data *aux = env->insn_aux_data;
- struct bpf_insn *insn = env->prog->insnsi;
- int insn_cnt = env->prog->len;
- u32 spills_num;
- bool modified = false;
- int i, j;
-
- for (i = 0; i < insn_cnt; i++, insn++) {
- if (aux[i].fastcall_spills_num > 0) {
- spills_num = aux[i].fastcall_spills_num;
- /* NOPs would be removed by opt_remove_nops() */
- for (j = 1; j <= spills_num; ++j) {
- *(insn - j) = NOP;
- *(insn + j) = NOP;
- }
- modified = true;
- }
- if ((subprog + 1)->start == i + 1) {
- if (modified && !subprog->keep_fastcall_stack)
- subprog->stack_depth = -subprog->fastcall_stack_off;
- subprog++;
- modified = false;
- }
- }
-
- return 0;
-}
static void free_states(struct bpf_verifier_env *env)
{
@@ -24428,13 +18635,13 @@ static void free_states(struct bpf_verifier_env *env)
struct bpf_scc_info *info;
int i, j;
- free_verifier_state(env->cur_state, true);
+ bpf_free_verifier_state(env->cur_state, true);
env->cur_state = NULL;
while (!pop_stack(env, NULL, NULL, false));
list_for_each_safe(pos, tmp, &env->free_list) {
sl = container_of(pos, struct bpf_verifier_state_list, node);
- free_verifier_state(&sl->state, false);
+ bpf_free_verifier_state(&sl->state, false);
kfree(sl);
}
INIT_LIST_HEAD(&env->free_list);
@@ -24444,7 +18651,7 @@ static void free_states(struct bpf_verifier_env *env)
if (!info)
continue;
for (j = 0; j < info->num_visits; j++)
- free_backedges(&info->visits[j]);
+ bpf_free_backedges(&info->visits[j]);
kvfree(info);
env->scc_info[i] = NULL;
}
@@ -24457,7 +18664,7 @@ static void free_states(struct bpf_verifier_env *env)
list_for_each_safe(pos, tmp, head) {
sl = container_of(pos, struct bpf_verifier_state_list, node);
- free_verifier_state(&sl->state, false);
+ bpf_free_verifier_state(&sl->state, false);
kfree(sl);
}
INIT_LIST_HEAD(&env->explored_states[i]);
@@ -24510,10 +18717,18 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
if (subprog_is_exc_cb(env, subprog)) {
state->frame[0]->in_exception_callback_fn = true;
- /* We have already ensured that the callback returns an integer, just
- * like all global subprogs. We need to determine it only has a single
- * scalar argument.
+
+ /*
+ * Global functions are scalar or void, make sure
+ * we return a scalar.
*/
+ if (subprog_returns_void(env, subprog)) {
+ verbose(env, "exception cb cannot return void\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Also ensure the callback only has a single scalar argument. */
if (sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_ANYTHING) {
verbose(env, "exception cb only supports single integer argument\n");
ret = -EINVAL;
@@ -24630,7 +18845,7 @@ static int do_check_subprogs(struct bpf_verifier_env *env)
again:
new_cnt = 0;
for (i = 1; i < env->subprog_cnt; i++) {
- if (!subprog_is_global(env, i))
+ if (!bpf_subprog_is_global(env, i))
continue;
sub_aux = subprog_aux(env, i);
@@ -24792,7 +19007,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
}
for (i = 0; i < st_ops_desc->arg_info[member_idx].cnt; i++) {
- if (st_ops_desc->arg_info[member_idx].info->refcounted) {
+ if (st_ops_desc->arg_info[member_idx].info[i].refcounted) {
has_refcounted_arg = true;
break;
}
@@ -24820,14 +19035,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
}
#define SECURITY_PREFIX "security_"
-static int check_attach_modify_return(unsigned long addr, const char *func_name)
-{
- if (within_error_injection_list(addr) ||
- !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1))
- return 0;
-
- return -EINVAL;
-}
+#ifdef CONFIG_FUNCTION_ERROR_INJECTION
/* list of non-sleepable functions that are otherwise on
* ALLOW_ERROR_INJECTION list
@@ -24850,6 +19058,75 @@ static int check_non_sleepable_error_inject(u32 btf_id)
return btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id);
}
+static int check_attach_sleepable(u32 btf_id, unsigned long addr, const char *func_name)
+{
+ /* fentry/fexit/fmod_ret progs can be sleepable if they are
+ * attached to ALLOW_ERROR_INJECTION and are not in denylist.
+ */
+ if (!check_non_sleepable_error_inject(btf_id) &&
+ within_error_injection_list(addr))
+ return 0;
+
+ return -EINVAL;
+}
+
+static int check_attach_modify_return(unsigned long addr, const char *func_name)
+{
+ if (within_error_injection_list(addr) ||
+ !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1))
+ return 0;
+
+ return -EINVAL;
+}
+
+#else
+
+/* Unfortunately, the arch-specific prefixes are hard-coded in arch syscall code
+ * so we need to hard-code them, too. Ftrace has arch_syscall_match_sym_name()
+ * but that just compares two concrete function names.
+ */
+static bool has_arch_syscall_prefix(const char *func_name)
+{
+#if defined(__x86_64__)
+ return !strncmp(func_name, "__x64_", 6);
+#elif defined(__i386__)
+ return !strncmp(func_name, "__ia32_", 7);
+#elif defined(__s390x__)
+ return !strncmp(func_name, "__s390x_", 8);
+#elif defined(__aarch64__)
+ return !strncmp(func_name, "__arm64_", 8);
+#elif defined(__riscv)
+ return !strncmp(func_name, "__riscv_", 8);
+#elif defined(__powerpc__) || defined(__powerpc64__)
+ return !strncmp(func_name, "sys_", 4);
+#elif defined(__loongarch__)
+ return !strncmp(func_name, "sys_", 4);
+#else
+ return false;
+#endif
+}
+
+/* Without error injection, allow sleepable and fmod_ret progs on syscalls. */
+
+static int check_attach_sleepable(u32 btf_id, unsigned long addr, const char *func_name)
+{
+ if (has_arch_syscall_prefix(func_name))
+ return 0;
+
+ return -EINVAL;
+}
+
+static int check_attach_modify_return(unsigned long addr, const char *func_name)
+{
+ if (has_arch_syscall_prefix(func_name) ||
+ !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1))
+ return 0;
+
+ return -EINVAL;
+}
+
+#endif /* CONFIG_FUNCTION_ERROR_INJECTION */
+
int bpf_check_attach_target(struct bpf_verifier_log *log,
const struct bpf_prog *prog,
const struct bpf_prog *tgt_prog,
@@ -24876,7 +19153,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
btf = tgt_prog ? tgt_prog->aux->btf : prog->aux->attach_btf;
if (!btf) {
bpf_log(log,
- "FENTRY/FEXIT program can only be attached to another program annotated with BTF\n");
+ "Tracing program can only be attached to another program annotated with BTF\n");
return -EINVAL;
}
t = btf_type_by_id(btf, btf_id);
@@ -24912,7 +19189,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
if (aux->func && aux->func[subprog]->aux->exception_cb) {
bpf_log(log,
"%s programs cannot attach to exception callback\n",
- prog_extension ? "Extension" : "FENTRY/FEXIT");
+ prog_extension ? "Extension" : "Tracing");
return -EINVAL;
}
conservative = aux->func_info_aux[subprog].unreliable;
@@ -25001,7 +19278,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
case BPF_TRACE_RAW_TP:
if (tgt_prog) {
bpf_log(log,
- "Only FENTRY/FEXIT progs are attachable to another BPF prog\n");
+ "Only FENTRY/FEXIT/FSESSION progs are attachable to another BPF prog\n");
return -EINVAL;
}
if (!btf_type_is_typedef(t)) {
@@ -25129,12 +19406,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
ret = -EINVAL;
switch (prog->type) {
case BPF_PROG_TYPE_TRACING:
-
- /* fentry/fexit/fmod_ret progs can be sleepable if they are
- * attached to ALLOW_ERROR_INJECTION and are not in denylist.
- */
- if (!check_non_sleepable_error_inject(btf_id) &&
- within_error_injection_list(addr))
+ if (!check_attach_sleepable(btf_id, addr, tname))
ret = 0;
/* fentry/fexit/fmod_ret progs can also be sleepable if they are
* in the fmodret id set with the KF_SLEEPABLE flag.
@@ -25231,7 +19503,6 @@ BTF_ID(func, __x64_sys_exit_group)
BTF_ID(func, do_exit)
BTF_ID(func, do_group_exit)
BTF_ID(func, kthread_complete_and_exit)
-BTF_ID(func, kthread_exit)
BTF_ID(func, make_task_dead)
BTF_SET_END(noreturn_deny)
@@ -25273,7 +19544,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
}
if (prog->sleepable && !can_be_sleepable(prog)) {
- verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
+ verbose(env, "Only fentry/fexit/fsession/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
return -EINVAL;
}
@@ -25421,430 +19692,209 @@ static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr,
return 0;
}
-/* Each field is a register bitmask */
-struct insn_live_regs {
- u16 use; /* registers read by instruction */
- u16 def; /* registers written by instruction */
- u16 in; /* registers that may be alive before instruction */
- u16 out; /* registers that may be alive after instruction */
-};
+/* replace a generic kfunc with a specialized version if necessary */
+static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, int insn_idx)
+{
+ struct bpf_prog *prog = env->prog;
+ bool seen_direct_write;
+ void *xdp_kfunc;
+ bool is_rdonly;
+ u32 func_id = desc->func_id;
+ u16 offset = desc->offset;
+ unsigned long addr = desc->addr;
-/* Bitmask with 1s for all caller saved registers */
-#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1)
+ if (offset) /* kfuncs from module BTF (offset != 0) are not specialized */
+ return 0;
-/* Compute info->{use,def} fields for the instruction */
-static void compute_insn_live_regs(struct bpf_verifier_env *env,
- struct bpf_insn *insn,
- struct insn_live_regs *info)
-{
- struct call_summary cs;
- u8 class = BPF_CLASS(insn->code);
- u8 code = BPF_OP(insn->code);
- u8 mode = BPF_MODE(insn->code);
- u16 src = BIT(insn->src_reg);
- u16 dst = BIT(insn->dst_reg);
- u16 r0 = BIT(0);
- u16 def = 0;
- u16 use = 0xffff;
+ if (bpf_dev_bound_kfunc_id(func_id)) {
+ xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id);
+ if (xdp_kfunc)
+ addr = (unsigned long)xdp_kfunc;
+ /* fallback to default kfunc when not supported by netdev */
+ } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
+ seen_direct_write = env->seen_direct_write;
+ is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE);
- switch (class) {
- case BPF_LD:
- switch (mode) {
- case BPF_IMM:
- if (BPF_SIZE(insn->code) == BPF_DW) {
- def = dst;
- use = 0;
- }
- break;
- case BPF_LD | BPF_ABS:
- case BPF_LD | BPF_IND:
- /* stick with defaults */
- break;
- }
- break;
- case BPF_LDX:
- switch (mode) {
- case BPF_MEM:
- case BPF_MEMSX:
- def = dst;
- use = src;
- break;
- }
- break;
- case BPF_ST:
- switch (mode) {
- case BPF_MEM:
- def = 0;
- use = dst;
- break;
- }
- break;
- case BPF_STX:
- switch (mode) {
- case BPF_MEM:
- def = 0;
- use = dst | src;
- break;
- case BPF_ATOMIC:
- switch (insn->imm) {
- case BPF_CMPXCHG:
- use = r0 | dst | src;
- def = r0;
- break;
- case BPF_LOAD_ACQ:
- def = dst;
- use = src;
- break;
- case BPF_STORE_REL:
- def = 0;
- use = dst | src;
- break;
- default:
- use = dst | src;
- if (insn->imm & BPF_FETCH)
- def = src;
- else
- def = 0;
- }
- break;
- }
- break;
- case BPF_ALU:
- case BPF_ALU64:
- switch (code) {
- case BPF_END:
- use = dst;
- def = dst;
- break;
- case BPF_MOV:
- def = dst;
- if (BPF_SRC(insn->code) == BPF_K)
- use = 0;
- else
- use = src;
- break;
- default:
- def = dst;
- if (BPF_SRC(insn->code) == BPF_K)
- use = dst;
- else
- use = dst | src;
- }
- break;
- case BPF_JMP:
- case BPF_JMP32:
- switch (code) {
- case BPF_JA:
- def = 0;
- if (BPF_SRC(insn->code) == BPF_X)
- use = dst;
- else
- use = 0;
- break;
- case BPF_JCOND:
- def = 0;
- use = 0;
- break;
- case BPF_EXIT:
- def = 0;
- use = r0;
- break;
- case BPF_CALL:
- def = ALL_CALLER_SAVED_REGS;
- use = def & ~BIT(BPF_REG_0);
- if (get_call_summary(env, insn, &cs))
- use = GENMASK(cs.num_params, 1);
- break;
- default:
- def = 0;
- if (BPF_SRC(insn->code) == BPF_K)
- use = dst;
- else
- use = dst | src;
- }
- break;
+ if (is_rdonly)
+ addr = (unsigned long)bpf_dynptr_from_skb_rdonly;
+
+ /* restore env->seen_direct_write to its original value, since
+ * may_access_direct_pkt_data mutates it
+ */
+ env->seen_direct_write = seen_direct_write;
+ } else if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr]) {
+ if (bpf_lsm_has_d_inode_locked(prog))
+ addr = (unsigned long)bpf_set_dentry_xattr_locked;
+ } else if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr]) {
+ if (bpf_lsm_has_d_inode_locked(prog))
+ addr = (unsigned long)bpf_remove_dentry_xattr_locked;
+ } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) {
+ if (!env->insn_aux_data[insn_idx].non_sleepable)
+ addr = (unsigned long)bpf_dynptr_from_file_sleepable;
+ } else if (func_id == special_kfunc_list[KF_bpf_arena_alloc_pages]) {
+ if (env->insn_aux_data[insn_idx].non_sleepable)
+ addr = (unsigned long)bpf_arena_alloc_pages_non_sleepable;
+ } else if (func_id == special_kfunc_list[KF_bpf_arena_free_pages]) {
+ if (env->insn_aux_data[insn_idx].non_sleepable)
+ addr = (unsigned long)bpf_arena_free_pages_non_sleepable;
}
+ desc->addr = addr;
+ return 0;
+}
- info->def = def;
- info->use = use;
+static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
+ u16 struct_meta_reg,
+ u16 node_offset_reg,
+ struct bpf_insn *insn,
+ struct bpf_insn *insn_buf,
+ int *cnt)
+{
+ struct btf_struct_meta *kptr_struct_meta = insn_aux->kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(struct_meta_reg, (long)kptr_struct_meta) };
+
+ insn_buf[0] = addr[0];
+ insn_buf[1] = addr[1];
+ insn_buf[2] = BPF_MOV64_IMM(node_offset_reg, insn_aux->insert_off);
+ insn_buf[3] = *insn;
+ *cnt = 4;
}
-/* Compute may-live registers after each instruction in the program.
- * The register is live after the instruction I if it is read by some
- * instruction S following I during program execution and is not
- * overwritten between I and S.
- *
- * Store result in env->insn_aux_data[i].live_regs.
- */
-static int compute_live_registers(struct bpf_verifier_env *env)
+int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ struct bpf_insn *insn_buf, int insn_idx, int *cnt)
{
- struct bpf_insn_aux_data *insn_aux = env->insn_aux_data;
- struct bpf_insn *insns = env->prog->insnsi;
- struct insn_live_regs *state;
- int insn_cnt = env->prog->len;
- int err = 0, i, j;
- bool changed;
-
- /* Use the following algorithm:
- * - define the following:
- * - I.use : a set of all registers read by instruction I;
- * - I.def : a set of all registers written by instruction I;
- * - I.in : a set of all registers that may be alive before I execution;
- * - I.out : a set of all registers that may be alive after I execution;
- * - insn_successors(I): a set of instructions S that might immediately
- * follow I for some program execution;
- * - associate separate empty sets 'I.in' and 'I.out' with each instruction;
- * - visit each instruction in a postorder and update
- * state[i].in, state[i].out as follows:
- *
- * state[i].out = U [state[s].in for S in insn_successors(i)]
- * state[i].in = (state[i].out / state[i].def) U state[i].use
- *
- * (where U stands for set union, / stands for set difference)
- * - repeat the computation while {in,out} fields changes for
- * any instruction.
+ struct bpf_kfunc_desc *desc;
+ int err;
+
+ if (!insn->imm) {
+ verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
+ return -EINVAL;
+ }
+
+ *cnt = 0;
+
+ /* insn->imm has the btf func_id. Replace it with an offset relative to
+ * __bpf_call_base, unless the JIT needs to call functions that are
+ * further than 32 bits away (bpf_jit_supports_far_kfunc_call()).
*/
- state = kvzalloc_objs(*state, insn_cnt, GFP_KERNEL_ACCOUNT);
- if (!state) {
- err = -ENOMEM;
- goto out;
+ desc = find_kfunc_desc(env->prog, insn->imm, insn->off);
+ if (!desc) {
+ verifier_bug(env, "kernel function descriptor not found for func_id %u",
+ insn->imm);
+ return -EFAULT;
}
- for (i = 0; i < insn_cnt; ++i)
- compute_insn_live_regs(env, &insns[i], &state[i]);
-
- changed = true;
- while (changed) {
- changed = false;
- for (i = 0; i < env->cfg.cur_postorder; ++i) {
- int insn_idx = env->cfg.insn_postorder[i];
- struct insn_live_regs *live = &state[insn_idx];
- struct bpf_iarray *succ;
- u16 new_out = 0;
- u16 new_in = 0;
-
- succ = bpf_insn_successors(env, insn_idx);
- for (int s = 0; s < succ->cnt; ++s)
- new_out |= state[succ->items[s]].in;
- new_in = (new_out & ~live->def) | live->use;
- if (new_out != live->out || new_in != live->in) {
- live->in = new_in;
- live->out = new_out;
- changed = true;
- }
+ err = specialize_kfunc(env, desc, insn_idx);
+ if (err)
+ return err;
+
+ if (!bpf_jit_supports_far_kfunc_call())
+ insn->imm = BPF_CALL_IMM(desc->addr);
+
+ if (is_bpf_obj_new_kfunc(desc->func_id) || is_bpf_percpu_obj_new_kfunc(desc->func_id)) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+ u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size;
+
+ if (is_bpf_percpu_obj_new_kfunc(desc->func_id) && kptr_struct_meta) {
+ verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d",
+ insn_idx);
+ return -EFAULT;
}
- }
- for (i = 0; i < insn_cnt; ++i)
- insn_aux[i].live_regs_before = state[i].in;
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size);
+ insn_buf[1] = addr[0];
+ insn_buf[2] = addr[1];
+ insn_buf[3] = *insn;
+ *cnt = 4;
+ } else if (is_bpf_obj_drop_kfunc(desc->func_id) ||
+ is_bpf_percpu_obj_drop_kfunc(desc->func_id) ||
+ is_bpf_refcount_acquire_kfunc(desc->func_id)) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
- if (env->log.level & BPF_LOG_LEVEL2) {
- verbose(env, "Live regs before insn:\n");
- for (i = 0; i < insn_cnt; ++i) {
- if (env->insn_aux_data[i].scc)
- verbose(env, "%3d ", env->insn_aux_data[i].scc);
- else
- verbose(env, " ");
- verbose(env, "%3d: ", i);
- for (j = BPF_REG_0; j < BPF_REG_10; ++j)
- if (insn_aux[i].live_regs_before & BIT(j))
- verbose(env, "%d", j);
- else
- verbose(env, ".");
- verbose(env, " ");
- verbose_insn(env, &insns[i]);
- if (bpf_is_ldimm64(&insns[i]))
- i++;
+ if (is_bpf_percpu_obj_drop_kfunc(desc->func_id) && kptr_struct_meta) {
+ verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d",
+ insn_idx);
+ return -EFAULT;
}
- }
-out:
- kvfree(state);
- return err;
-}
+ if (is_bpf_refcount_acquire_kfunc(desc->func_id) && !kptr_struct_meta) {
+ verifier_bug(env, "kptr_struct_meta expected at insn_idx %d",
+ insn_idx);
+ return -EFAULT;
+ }
-/*
- * Compute strongly connected components (SCCs) on the CFG.
- * Assign an SCC number to each instruction, recorded in env->insn_aux[*].scc.
- * If instruction is a sole member of its SCC and there are no self edges,
- * assign it SCC number of zero.
- * Uses a non-recursive adaptation of Tarjan's algorithm for SCC computation.
- */
-static int compute_scc(struct bpf_verifier_env *env)
-{
- const u32 NOT_ON_STACK = U32_MAX;
+ insn_buf[0] = addr[0];
+ insn_buf[1] = addr[1];
+ insn_buf[2] = *insn;
+ *cnt = 3;
+ } else if (is_bpf_list_push_kfunc(desc->func_id) ||
+ is_bpf_rbtree_add_kfunc(desc->func_id)) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ int struct_meta_reg = BPF_REG_3;
+ int node_offset_reg = BPF_REG_4;
- struct bpf_insn_aux_data *aux = env->insn_aux_data;
- const u32 insn_cnt = env->prog->len;
- int stack_sz, dfs_sz, err = 0;
- u32 *stack, *pre, *low, *dfs;
- u32 i, j, t, w;
- u32 next_preorder_num;
- u32 next_scc_id;
- bool assign_scc;
- struct bpf_iarray *succ;
-
- next_preorder_num = 1;
- next_scc_id = 1;
- /*
- * - 'stack' accumulates vertices in DFS order, see invariant comment below;
- * - 'pre[t] == p' => preorder number of vertex 't' is 'p';
- * - 'low[t] == n' => smallest preorder number of the vertex reachable from 't' is 'n';
- * - 'dfs' DFS traversal stack, used to emulate explicit recursion.
- */
- stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
- pre = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
- low = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
- dfs = kvcalloc(insn_cnt, sizeof(*dfs), GFP_KERNEL_ACCOUNT);
- if (!stack || !pre || !low || !dfs) {
- err = -ENOMEM;
- goto exit;
+ /* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */
+ if (is_bpf_rbtree_add_kfunc(desc->func_id)) {
+ struct_meta_reg = BPF_REG_4;
+ node_offset_reg = BPF_REG_5;
+ }
+
+ if (!kptr_struct_meta) {
+ verifier_bug(env, "kptr_struct_meta expected at insn_idx %d",
+ insn_idx);
+ return -EFAULT;
+ }
+
+ __fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg,
+ node_offset_reg, insn, insn_buf, cnt);
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
+ desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
+ insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
+ *cnt = 1;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_session_is_return] &&
+ env->prog->expected_attach_type == BPF_TRACE_FSESSION) {
+ /*
+ * inline the bpf_session_is_return() for fsession:
+ * bool bpf_session_is_return(void *ctx)
+ * {
+ * return (((u64 *)ctx)[-1] >> BPF_TRAMP_IS_RETURN_SHIFT) & 1;
+ * }
+ */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_IS_RETURN_SHIFT);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1);
+ *cnt = 3;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_session_cookie] &&
+ env->prog->expected_attach_type == BPF_TRACE_FSESSION) {
+ /*
+ * inline bpf_session_cookie() for fsession:
+ * __u64 *bpf_session_cookie(void *ctx)
+ * {
+ * u64 off = (((u64 *)ctx)[-1] >> BPF_TRAMP_COOKIE_INDEX_SHIFT) & 0xFF;
+ * return &((u64 *)ctx)[-off];
+ * }
+ */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_COOKIE_INDEX_SHIFT);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
+ insn_buf[3] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
+ insn_buf[4] = BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1);
+ insn_buf[5] = BPF_ALU64_IMM(BPF_NEG, BPF_REG_0, 0);
+ *cnt = 6;
}
- /*
- * References:
- * [1] R. Tarjan "Depth-First Search and Linear Graph Algorithms"
- * [2] D. J. Pearce "A Space-Efficient Algorithm for Finding Strongly Connected Components"
- *
- * The algorithm maintains the following invariant:
- * - suppose there is a path 'u' ~> 'v', such that 'pre[v] < pre[u]';
- * - then, vertex 'u' remains on stack while vertex 'v' is on stack.
- *
- * Consequently:
- * - If 'low[v] < pre[v]', there is a path from 'v' to some vertex 'u',
- * such that 'pre[u] == low[v]'; vertex 'u' is currently on the stack,
- * and thus there is an SCC (loop) containing both 'u' and 'v'.
- * - If 'low[v] == pre[v]', loops containing 'v' have been explored,
- * and 'v' can be considered the root of some SCC.
- *
- * Here is a pseudo-code for an explicitly recursive version of the algorithm:
- *
- * NOT_ON_STACK = insn_cnt + 1
- * pre = [0] * insn_cnt
- * low = [0] * insn_cnt
- * scc = [0] * insn_cnt
- * stack = []
- *
- * next_preorder_num = 1
- * next_scc_id = 1
- *
- * def recur(w):
- * nonlocal next_preorder_num
- * nonlocal next_scc_id
- *
- * pre[w] = next_preorder_num
- * low[w] = next_preorder_num
- * next_preorder_num += 1
- * stack.append(w)
- * for s in successors(w):
- * # Note: for classic algorithm the block below should look as:
- * #
- * # if pre[s] == 0:
- * # recur(s)
- * # low[w] = min(low[w], low[s])
- * # elif low[s] != NOT_ON_STACK:
- * # low[w] = min(low[w], pre[s])
- * #
- * # But replacing both 'min' instructions with 'low[w] = min(low[w], low[s])'
- * # does not break the invariant and makes itartive version of the algorithm
- * # simpler. See 'Algorithm #3' from [2].
- *
- * # 's' not yet visited
- * if pre[s] == 0:
- * recur(s)
- * # if 's' is on stack, pick lowest reachable preorder number from it;
- * # if 's' is not on stack 'low[s] == NOT_ON_STACK > low[w]',
- * # so 'min' would be a noop.
- * low[w] = min(low[w], low[s])
- *
- * if low[w] == pre[w]:
- * # 'w' is the root of an SCC, pop all vertices
- * # below 'w' on stack and assign same SCC to them.
- * while True:
- * t = stack.pop()
- * low[t] = NOT_ON_STACK
- * scc[t] = next_scc_id
- * if t == w:
- * break
- * next_scc_id += 1
- *
- * for i in range(0, insn_cnt):
- * if pre[i] == 0:
- * recur(i)
- *
- * Below implementation replaces explicit recursion with array 'dfs'.
- */
- for (i = 0; i < insn_cnt; i++) {
- if (pre[i])
- continue;
- stack_sz = 0;
- dfs_sz = 1;
- dfs[0] = i;
-dfs_continue:
- while (dfs_sz) {
- w = dfs[dfs_sz - 1];
- if (pre[w] == 0) {
- low[w] = next_preorder_num;
- pre[w] = next_preorder_num;
- next_preorder_num++;
- stack[stack_sz++] = w;
- }
- /* Visit 'w' successors */
- succ = bpf_insn_successors(env, w);
- for (j = 0; j < succ->cnt; ++j) {
- if (pre[succ->items[j]]) {
- low[w] = min(low[w], low[succ->items[j]]);
- } else {
- dfs[dfs_sz++] = succ->items[j];
- goto dfs_continue;
- }
- }
- /*
- * Preserve the invariant: if some vertex above in the stack
- * is reachable from 'w', keep 'w' on the stack.
- */
- if (low[w] < pre[w]) {
- dfs_sz--;
- goto dfs_continue;
- }
- /*
- * Assign SCC number only if component has two or more elements,
- * or if component has a self reference, or if instruction is a
- * callback calling function (implicit loop).
- */
- assign_scc = stack[stack_sz - 1] != w; /* two or more elements? */
- for (j = 0; j < succ->cnt; ++j) { /* self reference? */
- if (succ->items[j] == w) {
- assign_scc = true;
- break;
- }
- }
- if (bpf_calls_callback(env, w)) /* implicit loop? */
- assign_scc = true;
- /* Pop component elements from stack */
- do {
- t = stack[--stack_sz];
- low[t] = NOT_ON_STACK;
- if (assign_scc)
- aux[t].scc = next_scc_id;
- } while (t != w);
- if (assign_scc)
- next_scc_id++;
- dfs_sz--;
- }
- }
- env->scc_info = kvzalloc_objs(*env->scc_info, next_scc_id,
- GFP_KERNEL_ACCOUNT);
- if (!env->scc_info) {
- err = -ENOMEM;
- goto exit;
- }
- env->scc_cnt = next_scc_id;
-exit:
- kvfree(stack);
- kvfree(pre);
- kvfree(low);
- kvfree(dfs);
- return err;
+
+ if (env->insn_aux_data[insn_idx].arg_prog) {
+ u32 regno = env->insn_aux_data[insn_idx].arg_prog;
+ struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(regno, (long)env->prog->aux) };
+ int idx = *cnt;
+
+ insn_buf[idx++] = ld_addrs[0];
+ insn_buf[idx++] = ld_addrs[1];
+ insn_buf[idx++] = *insn;
+ *cnt = idx;
+ }
+ return 0;
}
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
@@ -25878,7 +19928,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
goto err_free_env;
for (i = 0; i < len; i++)
env->insn_aux_data[i].orig_idx = i;
- env->succ = iarray_realloc(NULL, 2);
+ env->succ = bpf_iarray_realloc(NULL, 2);
if (!env->succ)
goto err_free_env;
env->prog = *prog;
@@ -25939,7 +19989,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
INIT_LIST_HEAD(&env->explored_states[i]);
INIT_LIST_HEAD(&env->free_list);
- ret = check_btf_info_early(env, attr, uattr);
+ ret = bpf_check_btf_info_early(env, attr, uattr);
if (ret < 0)
goto skip_full_check;
@@ -25951,11 +20001,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (ret < 0)
goto skip_full_check;
- ret = check_btf_info(env, attr, uattr);
+ ret = bpf_check_btf_info(env, attr, uattr);
if (ret < 0)
goto skip_full_check;
- ret = resolve_pseudo_ldimm64(env);
+ ret = check_and_resolve_insns(env);
if (ret < 0)
goto skip_full_check;
@@ -25965,11 +20015,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
goto skip_full_check;
}
- ret = check_cfg(env);
+ ret = bpf_check_cfg(env);
if (ret < 0)
goto skip_full_check;
- ret = compute_postorder(env);
+ ret = bpf_compute_postorder(env);
if (ret < 0)
goto skip_full_check;
@@ -25981,11 +20031,23 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (ret)
goto skip_full_check;
- ret = compute_scc(env);
+ ret = bpf_compute_const_regs(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = bpf_prune_dead_branches(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = sort_subprogs_topo(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = bpf_compute_scc(env);
if (ret < 0)
goto skip_full_check;
- ret = compute_live_registers(env);
+ ret = bpf_compute_live_registers(env);
if (ret < 0)
goto skip_full_check;
@@ -26006,22 +20068,22 @@ skip_full_check:
* allocate additional slots.
*/
if (ret == 0)
- ret = remove_fastcall_spills_fills(env);
+ ret = bpf_remove_fastcall_spills_fills(env);
if (ret == 0)
ret = check_max_stack_depth(env);
/* instruction rewrites happen after this point */
if (ret == 0)
- ret = optimize_bpf_loop(env);
+ ret = bpf_optimize_bpf_loop(env);
if (is_priv) {
if (ret == 0)
- opt_hard_wire_dead_code_branches(env);
+ bpf_opt_hard_wire_dead_code_branches(env);
if (ret == 0)
- ret = opt_remove_dead_code(env);
+ ret = bpf_opt_remove_dead_code(env);
if (ret == 0)
- ret = opt_remove_nops(env);
+ ret = bpf_opt_remove_nops(env);
} else {
if (ret == 0)
sanitize_dead_code(env);
@@ -26029,22 +20091,22 @@ skip_full_check:
if (ret == 0)
/* program is valid, convert *(u32*)(ctx + off) accesses */
- ret = convert_ctx_accesses(env);
+ ret = bpf_convert_ctx_accesses(env);
if (ret == 0)
- ret = do_misc_fixups(env);
+ ret = bpf_do_misc_fixups(env);
/* do 32-bit optimization after insn patching has done so those patched
* insns could be handled correctly.
*/
if (ret == 0 && !bpf_prog_is_offloaded(env->prog->aux)) {
- ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);
+ ret = bpf_opt_subreg_zext_lo32_rnd_hi32(env, attr);
env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret
: false;
}
if (ret == 0)
- ret = fixup_call_args(env);
+ ret = bpf_fixup_call_args(env);
env->verification_time = ktime_get_ns() - start_time;
print_verification_stats(env);
@@ -26103,6 +20165,14 @@ skip_full_check:
adjust_btf_func(env);
+ /* extension progs temporarily inherit the attach_type of their targets
+ for verification purposes, so set it back to zero before returning
+ */
+ if (env->prog->type == BPF_PROG_TYPE_EXT)
+ env->prog->expected_attach_type = 0;
+
+ env->prog = __bpf_prog_select_runtime(env, env->prog, &ret);
+
err_release_maps:
if (ret)
release_insn_arrays(env);
@@ -26114,19 +20184,13 @@ err_release_maps:
if (!env->prog->aux->used_btfs)
release_btfs(env);
- /* extension progs temporarily inherit the attach_type of their targets
- for verification purposes, so set it back to zero before returning
- */
- if (env->prog->type == BPF_PROG_TYPE_EXT)
- env->prog->expected_attach_type = 0;
-
*prog = env->prog;
module_put(env->attach_btf_mod);
err_unlock:
if (!is_priv)
mutex_unlock(&bpf_verifier_lock);
- clear_insn_aux_data(env, 0, env->prog->len);
+ bpf_clear_insn_aux_data(env, 0, env->prog->len);
vfree(env->insn_aux_data);
err_free_env:
bpf_stack_liveness_free(env);
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 3bfe37693d68..58797123b752 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -184,11 +184,6 @@ extern bool cgrp_dfl_visible;
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
(((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
-static inline bool cgroup_is_dead(const struct cgroup *cgrp)
-{
- return !(cgrp->self.flags & CSS_ONLINE);
-}
-
static inline bool notify_on_release(const struct cgroup *cgrp)
{
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -222,7 +217,6 @@ static inline void get_css_set(struct css_set *cset)
}
bool cgroup_ssid_enabled(int ssid);
-bool cgroup_on_dfl(const struct cgroup *cgrp);
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
struct cgroup *task_cgroup_from_root(struct task_struct *task,
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index c22cda7766d8..1f084ee71443 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -69,14 +69,6 @@
#define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100)
/*
- * To avoid confusing the compiler (and generating warnings) with code
- * that attempts to access what would be a 0-element array (i.e. sized
- * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this
- * constant expression can be added.
- */
-#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0)
-
-/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
*
@@ -107,12 +99,6 @@ static bool cgroup_debug __read_mostly;
*/
static DEFINE_SPINLOCK(cgroup_idr_lock);
-/*
- * Protects cgroup_file->kn for !self csses. It synchronizes notifications
- * against file removal/re-creation across css hiding.
- */
-static DEFINE_SPINLOCK(cgroup_file_kn_lock);
-
DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
#define cgroup_assert_mutex_or_rcu_locked() \
@@ -510,27 +496,6 @@ static u32 cgroup_ss_mask(struct cgroup *cgrp)
}
/**
- * cgroup_css - obtain a cgroup's css for the specified subsystem
- * @cgrp: the cgroup of interest
- * @ss: the subsystem of interest (%NULL returns @cgrp->self)
- *
- * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
- * function must be called either under cgroup_mutex or rcu_read_lock() and
- * the caller is responsible for pinning the returned css if it wants to
- * keep accessing it outside the said locks. This function may return
- * %NULL if @cgrp doesn't have @subsys_id enabled.
- */
-static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
- struct cgroup_subsys *ss)
-{
- if (CGROUP_HAS_SUBSYS_CONFIG && ss)
- return rcu_dereference_check(cgrp->subsys[ss->id],
- lockdep_is_held(&cgroup_mutex));
- else
- return &cgrp->self;
-}
-
-/**
* cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
@@ -741,32 +706,6 @@ EXPORT_SYMBOL_GPL(of_css);
} \
} while (false)
-/* iterate over child cgrps, lock should be held throughout iteration */
-#define cgroup_for_each_live_child(child, cgrp) \
- list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
- if (({ lockdep_assert_held(&cgroup_mutex); \
- cgroup_is_dead(child); })) \
- ; \
- else
-
-/* walk live descendants in pre order */
-#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \
- css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \
- if (({ lockdep_assert_held(&cgroup_mutex); \
- (dsct) = (d_css)->cgroup; \
- cgroup_is_dead(dsct); })) \
- ; \
- else
-
-/* walk live descendants in postorder */
-#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \
- css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
- if (({ lockdep_assert_held(&cgroup_mutex); \
- (dsct) = (d_css)->cgroup; \
- cgroup_is_dead(dsct); })) \
- ; \
- else
-
/*
* The default css_set - used by init and its children prior to any
* hierarchies being mounted. It contains a pointer to the root state
@@ -1748,9 +1687,9 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
struct cgroup_file *cfile = (void *)css + cft->file_offset;
- spin_lock_irq(&cgroup_file_kn_lock);
- cfile->kn = NULL;
- spin_unlock_irq(&cgroup_file_kn_lock);
+ spin_lock_irq(&cfile->lock);
+ WRITE_ONCE(cfile->kn, NULL);
+ spin_unlock_irq(&cfile->lock);
timer_delete_sync(&cfile->notify_timer);
}
@@ -2126,6 +2065,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
#endif
init_waitqueue_head(&cgrp->offline_waitq);
+ init_waitqueue_head(&cgrp->dying_populated_waitq);
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
@@ -2608,6 +2548,7 @@ static void cgroup_migrate_add_task(struct task_struct *task,
mgctx->tset.nr_tasks++;
+ css_set_skip_task_iters(cset, task);
list_move_tail(&task->cg_list, &cset->mg_tasks);
if (list_empty(&cset->mg_node))
list_add_tail(&cset->mg_node,
@@ -4427,10 +4368,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
struct cgroup_file *cfile = (void *)css + cft->file_offset;
timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
-
- spin_lock_irq(&cgroup_file_kn_lock);
+ spin_lock_init(&cfile->lock);
cfile->kn = kn;
- spin_unlock_irq(&cgroup_file_kn_lock);
}
return 0;
@@ -4685,21 +4624,32 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
*/
void cgroup_file_notify(struct cgroup_file *cfile)
{
- unsigned long flags;
+ unsigned long flags, last, next;
+ struct kernfs_node *kn = NULL;
+
+ if (!READ_ONCE(cfile->kn))
+ return;
- spin_lock_irqsave(&cgroup_file_kn_lock, flags);
+ last = READ_ONCE(cfile->notified_at);
+ next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
+ if (time_in_range(jiffies, last, next)) {
+ timer_reduce(&cfile->notify_timer, next);
+ if (timer_pending(&cfile->notify_timer))
+ return;
+ }
+
+ spin_lock_irqsave(&cfile->lock, flags);
if (cfile->kn) {
- unsigned long last = cfile->notified_at;
- unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
+ kn = cfile->kn;
+ kernfs_get(kn);
+ WRITE_ONCE(cfile->notified_at, jiffies);
+ }
+ spin_unlock_irqrestore(&cfile->lock, flags);
- if (time_in_range(jiffies, last, next)) {
- timer_reduce(&cfile->notify_timer, next);
- } else {
- kernfs_notify(cfile->kn);
- cfile->notified_at = jiffies;
- }
+ if (kn) {
+ kernfs_notify(kn);
+ kernfs_put(kn);
}
- spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
}
EXPORT_SYMBOL_GPL(cgroup_file_notify);
@@ -4712,10 +4662,10 @@ void cgroup_file_show(struct cgroup_file *cfile, bool show)
{
struct kernfs_node *kn;
- spin_lock_irq(&cgroup_file_kn_lock);
+ spin_lock_irq(&cfile->lock);
kn = cfile->kn;
kernfs_get(kn);
- spin_unlock_irq(&cgroup_file_kn_lock);
+ spin_unlock_irq(&cfile->lock);
if (kn)
kernfs_show(kn, show);
@@ -5108,6 +5058,12 @@ repeat:
return;
task = list_entry(it->task_pos, struct task_struct, cg_list);
+ /*
+ * Hide tasks that are exiting but not yet removed. Keep zombie
+ * leaders with live threads visible.
+ */
+ if ((task->flags & PF_EXITING) && !atomic_read(&task->signal->live))
+ goto repeat;
if (it->flags & CSS_TASK_ITER_PROCS) {
/* if PROCS, skip over tasks which aren't group leaders */
@@ -6217,6 +6173,78 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
return 0;
};
+/**
+ * cgroup_drain_dying - wait for dying tasks to leave before rmdir
+ * @cgrp: the cgroup being removed
+ *
+ * cgroup.procs and cgroup.threads use css_task_iter which filters out
+ * PF_EXITING tasks so that userspace doesn't see tasks that have already been
+ * reaped via waitpid(). However, cgroup_has_tasks() - which tests whether the
+ * cgroup has non-empty css_sets - is only updated when dying tasks pass through
+ * cgroup_task_dead() in finish_task_switch(). This creates a window where
+ * cgroup.procs reads empty but cgroup_has_tasks() is still true, making rmdir
+ * fail with -EBUSY from cgroup_destroy_locked() even though userspace sees no
+ * tasks.
+ *
+ * This function aligns cgroup_has_tasks() with what userspace can observe. If
+ * cgroup_has_tasks() is true but the task iterator sees nothing (all remaining tasks are
+ * PF_EXITING), we wait for cgroup_task_dead() to finish processing them. As the
+ * window between PF_EXITING and cgroup_task_dead() is short, the wait is brief.
+ *
+ * This function only concerns itself with this cgroup's own dying tasks.
+ * Whether the cgroup has children is cgroup_destroy_locked()'s problem.
+ *
+ * Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we
+ * retry the full check from scratch.
+ *
+ * Must be called with cgroup_mutex held.
+ */
+static int cgroup_drain_dying(struct cgroup *cgrp)
+ __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+ DEFINE_WAIT(wait);
+
+ lockdep_assert_held(&cgroup_mutex);
+retry:
+ if (!cgroup_has_tasks(cgrp))
+ return 0;
+
+ /* Same iterator as cgroup.threads - if any task is visible, it's busy */
+ css_task_iter_start(&cgrp->self, 0, &it);
+ task = css_task_iter_next(&it);
+ css_task_iter_end(&it);
+
+ if (task)
+ return -EBUSY;
+
+ /*
+ * All remaining tasks are PF_EXITING and will pass through
+ * cgroup_task_dead() shortly. Wait for a kick and retry.
+ *
+ * cgroup_has_tasks() can't transition from false to true while we're
+ * holding cgroup_mutex, but the true to false transition happens
+ * under css_set_lock (via cgroup_task_dead()). We must retest and
+ * prepare_to_wait() under css_set_lock. Otherwise, the transition
+ * can happen between our first test and prepare_to_wait(), and we
+ * sleep with no one to wake us.
+ */
+ spin_lock_irq(&css_set_lock);
+ if (!cgroup_has_tasks(cgrp)) {
+ spin_unlock_irq(&css_set_lock);
+ return 0;
+ }
+ prepare_to_wait(&cgrp->dying_populated_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_unlock_irq(&css_set_lock);
+ mutex_unlock(&cgroup_mutex);
+ schedule();
+ finish_wait(&cgrp->dying_populated_waitq, &wait);
+ mutex_lock(&cgroup_mutex);
+ goto retry;
+}
+
int cgroup_rmdir(struct kernfs_node *kn)
{
struct cgroup *cgrp;
@@ -6226,9 +6254,12 @@ int cgroup_rmdir(struct kernfs_node *kn)
if (!cgrp)
return 0;
- ret = cgroup_destroy_locked(cgrp);
- if (!ret)
- TRACE_CGROUP_PATH(rmdir, cgrp);
+ ret = cgroup_drain_dying(cgrp);
+ if (!ret) {
+ ret = cgroup_destroy_locked(cgrp);
+ if (!ret)
+ TRACE_CGROUP_PATH(rmdir, cgrp);
+ }
cgroup_kn_unlock(kn);
return ret;
@@ -6988,6 +7019,7 @@ void cgroup_task_exit(struct task_struct *tsk)
static void do_cgroup_task_dead(struct task_struct *tsk)
{
+ struct cgrp_cset_link *link;
struct css_set *cset;
unsigned long flags;
@@ -7001,6 +7033,11 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
list_add_tail(&tsk->cg_list, &cset->dying_tasks);
+ /* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
+ if (waitqueue_active(&link->cgrp->dying_populated_waitq))
+ wake_up(&link->cgrp->dying_populated_waitq);
+
if (dl_task(tsk))
dec_dl_tasks_cs(tsk);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 9faf34377a88..1335e437098e 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -62,6 +62,75 @@ static const char * const perr_strings[] = {
};
/*
+ * CPUSET Locking Convention
+ * -------------------------
+ *
+ * Below are the four global/local locks guarding cpuset structures in lock
+ * acquisition order:
+ * - cpuset_top_mutex
+ * - cpu_hotplug_lock (cpus_read_lock/cpus_write_lock)
+ * - cpuset_mutex
+ * - callback_lock (raw spinlock)
+ *
+ * As cpuset will now indirectly flush a number of different workqueues in
+ * housekeeping_update() to update housekeeping cpumasks when the set of
+ * isolated CPUs is going to be changed, it may be vulnerable to deadlock
+ * if we hold cpus_read_lock while calling into housekeeping_update().
+ *
+ * The first cpuset_top_mutex will be held except when calling into
+ * cpuset_handle_hotplug() from the CPU hotplug code where cpus_write_lock
+ * and cpuset_mutex will be held instead. The main purpose of this mutex
+ * is to prevent regular cpuset control file write actions from interfering
+ * with the call to housekeeping_update(), though CPU hotplug operation can
+ * still happen in parallel. This mutex also provides protection for some
+ * internal variables.
+ *
+ * A task must hold all the remaining three locks to modify externally visible
+ * or used fields of cpusets, though some of the internally used cpuset fields
+ * and internal variables can be modified without holding callback_lock. If only
+ * reliable read access of the externally used fields is needed, a task can
+ * hold either cpuset_mutex or callback_lock which are exposed to other
+ * external subsystems.
+ *
+ * If a task holds cpu_hotplug_lock and cpuset_mutex, it blocks others,
+ * ensuring that it is the only task able to also acquire callback_lock and
+ * be able to modify cpusets. It can perform various checks on the cpuset
+ * structure first, knowing nothing will change. It can also allocate memory
+ * without holding callback_lock. While it is performing these checks, various
+ * callback routines can briefly acquire callback_lock to query cpusets. Once
+ * it is ready to make the changes, it takes callback_lock, blocking everyone
+ * else.
+ *
+ * Calls to the kernel memory allocator cannot be made while holding
+ * callback_lock which is a spinlock, as the memory allocator may sleep or
+ * call back into cpuset code and acquire callback_lock.
+ *
+ * Now, the task_struct fields mems_allowed and mempolicy may be changed
+ * by other task, we use alloc_lock in the task_struct fields to protect
+ * them.
+ *
+ * The cpuset_common_seq_show() handlers only hold callback_lock across
+ * small pieces of code, such as when reading out possibly multi-word
+ * cpumasks and nodemasks.
+ */
+
+static DEFINE_MUTEX(cpuset_top_mutex);
+static DEFINE_MUTEX(cpuset_mutex);
+
+/*
+ * File level internal variables below follow one of the following exclusion
+ * rules.
+ *
+ * RWCS: Read/write-able by holding either cpus_write_lock (and optionally
+ * cpuset_mutex) or both cpus_read_lock and cpuset_mutex.
+ *
+ * CSCB: Readable by holding either cpuset_mutex or callback_lock. Writable
+ * by holding both cpuset_mutex and callback_lock.
+ *
+ * T: Read/write-able by holding the cpuset_top_mutex.
+ */
+
+/*
* For local partitions, update to subpartitions_cpus & isolated_cpus is done
* in update_parent_effective_cpumask(). For remote partitions, it is done in
* the remote_partition_*() and remote_cpus_update() helpers.
@@ -70,19 +139,22 @@ static const char * const perr_strings[] = {
* Exclusive CPUs distributed out to local or remote sub-partitions of
* top_cpuset
*/
-static cpumask_var_t subpartitions_cpus;
+static cpumask_var_t subpartitions_cpus; /* RWCS */
/*
- * Exclusive CPUs in isolated partitions
+ * Exclusive CPUs in isolated partitions (shown in cpuset.cpus.isolated)
*/
-static cpumask_var_t isolated_cpus;
+static cpumask_var_t isolated_cpus; /* CSCB */
/*
- * isolated_cpus updating flag (protected by cpuset_mutex)
- * Set if isolated_cpus is going to be updated in the current
- * cpuset_mutex crtical section.
+ * Set if housekeeping cpumasks are to be updated.
*/
-static bool isolated_cpus_updating;
+static bool update_housekeeping; /* RWCS */
+
+/*
+ * Copy of isolated_cpus to be passed to housekeeping_update()
+ */
+static cpumask_var_t isolated_hk_cpus; /* T */
/*
* A flag to force sched domain rebuild at the end of an operation.
@@ -98,7 +170,7 @@ static bool isolated_cpus_updating;
* Note that update_relax_domain_level() in cpuset-v1.c can still call
* rebuild_sched_domains_locked() directly without using this flag.
*/
-static bool force_sd_rebuild;
+static bool force_sd_rebuild; /* RWCS */
/*
* Partition root states:
@@ -218,42 +290,6 @@ struct cpuset top_cpuset = {
.partition_root_state = PRS_ROOT,
};
-/*
- * There are two global locks guarding cpuset structures - cpuset_mutex and
- * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel
- * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
- * structures. Note that cpuset_mutex needs to be a mutex as it is used in
- * paths that rely on priority inheritance (e.g. scheduler - on RT) for
- * correctness.
- *
- * A task must hold both locks to modify cpusets. If a task holds
- * cpuset_mutex, it blocks others, ensuring that it is the only task able to
- * also acquire callback_lock and be able to modify cpusets. It can perform
- * various checks on the cpuset structure first, knowing nothing will change.
- * It can also allocate memory while just holding cpuset_mutex. While it is
- * performing these checks, various callback routines can briefly acquire
- * callback_lock to query cpusets. Once it is ready to make the changes, it
- * takes callback_lock, blocking everyone else.
- *
- * Calls to the kernel memory allocator can not be made while holding
- * callback_lock, as that would risk double tripping on callback_lock
- * from one of the callbacks into the cpuset code from within
- * __alloc_pages().
- *
- * If a task is only holding callback_lock, then it has read-only
- * access to cpusets.
- *
- * Now, the task_struct fields mems_allowed and mempolicy may be changed
- * by other task, we use alloc_lock in the task_struct fields to protect
- * them.
- *
- * The cpuset_common_seq_show() handlers only hold callback_lock across
- * small pieces of code, such as when reading out possibly multi-word
- * cpumasks and nodemasks.
- */
-
-static DEFINE_MUTEX(cpuset_mutex);
-
/**
* cpuset_lock - Acquire the global cpuset mutex
*
@@ -283,6 +319,7 @@ void lockdep_assert_cpuset_lock_held(void)
*/
void cpuset_full_lock(void)
{
+ mutex_lock(&cpuset_top_mutex);
cpus_read_lock();
mutex_lock(&cpuset_mutex);
}
@@ -291,12 +328,14 @@ void cpuset_full_unlock(void)
{
mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
+ mutex_unlock(&cpuset_top_mutex);
}
#ifdef CONFIG_LOCKDEP
bool lockdep_is_cpuset_held(void)
{
- return lockdep_is_held(&cpuset_mutex);
+ return lockdep_is_held(&cpuset_mutex) ||
+ lockdep_is_held(&cpuset_top_mutex);
}
#endif
@@ -840,7 +879,7 @@ generate_doms:
/*
* Cgroup v2 doesn't support domain attributes, just set all of them
* to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
- * subset of HK_TYPE_DOMAIN housekeeping CPUs.
+ * subset of HK_TYPE_DOMAIN_BOOT housekeeping CPUs.
*/
for (i = 0; i < ndoms; i++) {
/*
@@ -849,7 +888,7 @@ generate_doms:
*/
if (!csa || csa[i] == &top_cpuset)
cpumask_and(doms[i], top_cpuset.effective_cpus,
- housekeeping_cpumask(HK_TYPE_DOMAIN));
+ housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT));
else
cpumask_copy(doms[i], csa[i]->effective_cpus);
if (dattr)
@@ -961,7 +1000,7 @@ void rebuild_sched_domains_locked(void)
* offline CPUs, a warning is emitted and we return directly to
* prevent the panic.
*/
- for (i = 0; i < ndoms; ++i) {
+ for (i = 0; doms && i < ndoms; i++) {
if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask)))
return;
}
@@ -1161,12 +1200,18 @@ static void reset_partition_data(struct cpuset *cs)
static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus)
{
WARN_ON_ONCE(old_prs == new_prs);
- if (new_prs == PRS_ISOLATED)
+ lockdep_assert_held(&callback_lock);
+ lockdep_assert_held(&cpuset_mutex);
+ if (new_prs == PRS_ISOLATED) {
+ if (cpumask_subset(xcpus, isolated_cpus))
+ return;
cpumask_or(isolated_cpus, isolated_cpus, xcpus);
- else
+ } else {
+ if (!cpumask_intersects(xcpus, isolated_cpus))
+ return;
cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
-
- isolated_cpus_updating = true;
+ }
+ update_housekeeping = true;
}
/*
@@ -1219,8 +1264,8 @@ static void partition_xcpus_del(int old_prs, struct cpuset *parent,
isolated_cpus_update(old_prs, parent->partition_root_state,
xcpus);
- cpumask_and(xcpus, xcpus, cpu_active_mask);
cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
+ cpumask_and(parent->effective_cpus, parent->effective_cpus, cpu_active_mask);
}
/*
@@ -1284,22 +1329,45 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
}
/*
- * update_isolation_cpumasks - Update external isolation related CPU masks
+ * cpuset_update_sd_hk_unlock - Rebuild sched domains, update HK & unlock
*
- * The following external CPU masks will be updated if necessary:
- * - workqueue unbound cpumask
+ * Update housekeeping cpumasks and rebuild sched domains if necessary and
+ * then do a cpuset_full_unlock().
+ * This should be called at the end of cpuset operation.
*/
-static void update_isolation_cpumasks(void)
+static void cpuset_update_sd_hk_unlock(void)
+ __releases(&cpuset_mutex)
+ __releases(&cpuset_top_mutex)
{
- int ret;
+ /* force_sd_rebuild will be cleared in rebuild_sched_domains_locked() */
+ if (force_sd_rebuild)
+ rebuild_sched_domains_locked();
- if (!isolated_cpus_updating)
- return;
+ if (update_housekeeping) {
+ update_housekeeping = false;
+ cpumask_copy(isolated_hk_cpus, isolated_cpus);
- ret = housekeeping_update(isolated_cpus);
- WARN_ON_ONCE(ret < 0);
+ /*
+ * housekeeping_update() is now called without holding
+ * cpus_read_lock and cpuset_mutex. Only cpuset_top_mutex
+ * is still being held for mutual exclusion.
+ */
+ mutex_unlock(&cpuset_mutex);
+ cpus_read_unlock();
+ WARN_ON_ONCE(housekeeping_update(isolated_hk_cpus));
+ mutex_unlock(&cpuset_top_mutex);
+ } else {
+ cpuset_full_unlock();
+ }
+}
- isolated_cpus_updating = false;
+/*
+ * Work function to invoke cpuset_update_sd_hk_unlock()
+ */
+static void hk_sd_workfn(struct work_struct *work)
+{
+ cpuset_full_lock();
+ cpuset_update_sd_hk_unlock();
}
/**
@@ -1450,7 +1518,6 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
cs->remote_partition = true;
cpumask_copy(cs->effective_xcpus, tmp->new_cpus);
spin_unlock_irq(&callback_lock);
- update_isolation_cpumasks();
cpuset_force_rebuild();
cs->prs_err = 0;
@@ -1495,7 +1562,6 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
compute_excpus(cs, cs->effective_xcpus);
reset_partition_data(cs);
spin_unlock_irq(&callback_lock);
- update_isolation_cpumasks();
cpuset_force_rebuild();
/*
@@ -1566,7 +1632,6 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
if (xcpus)
cpumask_copy(cs->exclusive_cpus, xcpus);
spin_unlock_irq(&callback_lock);
- update_isolation_cpumasks();
if (adding || deleting)
cpuset_force_rebuild();
@@ -1910,7 +1975,6 @@ write_error:
partition_xcpus_add(new_prs, parent, tmp->delmask);
spin_unlock_irq(&callback_lock);
- update_isolation_cpumasks();
if ((old_prs != new_prs) && (cmd == partcmd_update))
update_partition_exclusive_flag(cs, new_prs);
@@ -2155,7 +2219,7 @@ get_css:
WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
- cpuset_update_tasks_cpumask(cp, cp->effective_cpus);
+ cpuset_update_tasks_cpumask(cp, tmp->new_cpus);
/*
* On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
@@ -2878,7 +2942,6 @@ out:
else if (isolcpus_updated)
isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus);
spin_unlock_irq(&callback_lock);
- update_isolation_cpumasks();
/* Force update if switching back to member & update effective_xcpus */
update_cpumasks_hier(cs, &tmpmask, !new_prs);
@@ -2925,7 +2988,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
struct cgroup_subsys_state *css;
struct cpuset *cs, *oldcs;
struct task_struct *task;
- bool cpus_updated, mems_updated;
+ bool setsched_check;
int ret;
/* used later by cpuset_attach() */
@@ -2940,20 +3003,31 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
if (ret)
goto out_unlock;
- cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
- mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+ /*
+ * Skip the task setscheduler permission check in v2 when nothing changes;
+ * migration permission derives from hierarchy ownership in
+ * cgroup_procs_write_permission().
+ */
+ setsched_check = !cpuset_v2() ||
+ !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus) ||
+ !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+
+ /*
+ * A v1 cpuset with tasks will have no CPU left only when CPU hotplug
+ * brings the last online CPU offline as users are not allowed to empty
+ * cpuset.cpus when there are active tasks inside. When that happens,
+ * we should allow tasks to migrate out without security check to make
+ * sure they will be able to run after migration.
+ */
+ if (!is_in_v2_mode() && cpumask_empty(oldcs->effective_cpus))
+ setsched_check = false;
cgroup_taskset_for_each(task, css, tset) {
ret = task_can_attach(task);
if (ret)
goto out_unlock;
- /*
- * Skip rights over task check in v2 when nothing changes,
- * migration permission derives from hierarchy ownership in
- * cgroup_procs_write_permission()).
- */
- if (!cpuset_v2() || (cpus_updated || mems_updated)) {
+ if (setsched_check) {
ret = security_task_setscheduler(task);
if (ret)
goto out_unlock;
@@ -3168,10 +3242,8 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
}
free_cpuset(trialcs);
- if (force_sd_rebuild)
- rebuild_sched_domains_locked();
out_unlock:
- cpuset_full_unlock();
+ cpuset_update_sd_hk_unlock();
if (of_cft(of)->private == FILE_MEMLIST)
schedule_flush_migrate_mm();
return retval ?: nbytes;
@@ -3278,7 +3350,7 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf,
cpuset_full_lock();
if (is_cpuset_online(cs))
retval = update_prstate(cs, val);
- cpuset_full_unlock();
+ cpuset_update_sd_hk_unlock();
return retval ?: nbytes;
}
@@ -3452,7 +3524,7 @@ static void cpuset_css_killed(struct cgroup_subsys_state *css)
/* Reset valid partition back to member */
if (is_partition_valid(cs))
update_prstate(cs, PRS_MEMBER);
- cpuset_full_unlock();
+ cpuset_update_sd_hk_unlock();
}
static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -3607,6 +3679,7 @@ int __init cpuset_init(void)
BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&isolated_hk_cpus, GFP_KERNEL));
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
@@ -3778,6 +3851,7 @@ unlock:
*/
static void cpuset_handle_hotplug(void)
{
+ static DECLARE_WORK(hk_sd_work, hk_sd_workfn);
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
@@ -3859,9 +3933,25 @@ static void cpuset_handle_hotplug(void)
rcu_read_unlock();
}
- /* rebuild sched domains if necessary */
+ /*
+ * rebuild_sched_domains() will always be called directly if needed
+ * to make sure that newly added or removed CPU will be reflected in
+ * the sched domains. However, if isolated partition invalidation
+ * or recreation is being done (update_housekeeping set), a work item
+ * will be queued to call housekeeping_update() to update the
+ * corresponding housekeeping cpumasks after some slight delay.
+ *
+ * We rely on WORK_STRUCT_PENDING_BIT to not requeue a work item that
+ * is still pending. Before the pending bit is cleared, the work data
+ * is copied out and work item dequeued. So it is possible to queue
+ * the work again before the hk_sd_workfn() is invoked to process the
+ * previously queued work. Since hk_sd_workfn() doesn't use the work
+ * item at all, this is not a problem.
+ */
if (force_sd_rebuild)
rebuild_sched_domains_cpuslocked();
+ if (update_housekeeping)
+ queue_work(system_dfl_wq, &hk_sd_work);
free_tmpmasks(ptmp);
}
diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c
index 9d95824dc6fa..1ab1fb47f271 100644
--- a/kernel/cgroup/dmem.c
+++ b/kernel/cgroup/dmem.c
@@ -707,8 +707,7 @@ static int dmem_cgroup_region_capacity_show(struct seq_file *sf, void *v)
return 0;
}
-static int dmemcg_parse_limit(char *options, struct dmem_cgroup_region *region,
- u64 *new_limit)
+static int dmemcg_parse_limit(char *options, u64 *new_limit)
{
char *end;
@@ -762,7 +761,7 @@ static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
if (!region)
return -EINVAL;
- err = dmemcg_parse_limit(options, region, &new_limit);
+ err = dmemcg_parse_limit(options, &new_limit);
if (err < 0)
goto out_put;
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index 09258eebb5c7..9967fb25c563 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -173,7 +173,7 @@ uncharge_cg_locked(struct rdma_cgroup *cg,
* the system.
*/
if (unlikely(!rpool)) {
- pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
+ pr_warn("Invalid device %p or rdma cgroup %p\n", device, cg);
return;
}
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index 774702591d26..307c97ac5fa9 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -29,7 +29,6 @@ CONFIG_SECTION_MISMATCH_WARN_ONLY=y
# CONFIG_UBSAN_ALIGNMENT is not set
# CONFIG_UBSAN_DIV_ZERO is not set
# CONFIG_UBSAN_TRAP is not set
-# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set
CONFIG_DEBUG_FS=y
CONFIG_DEBUG_FS_ALLOW_ALL=y
CONFIG_DEBUG_IRQFLAGS=y
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 2c1a3791e410..4f21fc3b108b 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -27,8 +27,6 @@
#include <asm/page.h>
#include <asm/sections.h>
-#include <crypto/sha1.h>
-
#include "kallsyms_internal.h"
#include "kexec_internal.h"
diff --git a/kernel/crash_dump_dm_crypt.c b/kernel/crash_dump_dm_crypt.c
index 1f4067fbdb94..cb875ddb6ba6 100644
--- a/kernel/crash_dump_dm_crypt.c
+++ b/kernel/crash_dump_dm_crypt.c
@@ -6,6 +6,7 @@
#include <linux/cc_platform.h>
#include <linux/configfs.h>
#include <linux/module.h>
+#include <linux/sysfs.h>
#define KEY_NUM_MAX 128 /* maximum dm crypt keys */
#define KEY_SIZE_MAX 256 /* maximum dm crypt key size */
@@ -115,7 +116,7 @@ static int restore_dm_crypt_keys_to_thread_keyring(void)
addr = dm_crypt_keys_addr;
dm_crypt_keys_read((char *)&key_count, sizeof(key_count), &addr);
- if (key_count < 0 || key_count > KEY_NUM_MAX) {
+ if (key_count > KEY_NUM_MAX) {
kexec_dprintk("Failed to read the number of dm-crypt keys\n");
return -1;
}
@@ -139,7 +140,7 @@ static int restore_dm_crypt_keys_to_thread_keyring(void)
return 0;
}
-static int read_key_from_user_keying(struct dm_crypt_key *dm_key)
+static int read_key_from_user_keyring(struct dm_crypt_key *dm_key)
{
const struct user_key_payload *ukp;
struct key *key;
@@ -168,8 +169,8 @@ static int read_key_from_user_keying(struct dm_crypt_key *dm_key)
memcpy(dm_key->data, ukp->data, ukp->datalen);
dm_key->key_size = ukp->datalen;
- kexec_dprintk("Get dm crypt key (size=%u) %s: %8ph\n", dm_key->key_size,
- dm_key->key_desc, dm_key->data);
+ kexec_dprintk("Get dm crypt key (size=%u) %s\n", dm_key->key_size,
+ dm_key->key_desc);
out:
up_read(&key->sem);
@@ -189,7 +190,7 @@ static inline struct config_key *to_config_key(struct config_item *item)
static ssize_t config_key_description_show(struct config_item *item, char *page)
{
- return sprintf(page, "%s\n", to_config_key(item)->description);
+ return sysfs_emit(page, "%s\n", to_config_key(item)->description);
}
static ssize_t config_key_description_store(struct config_item *item,
@@ -265,7 +266,7 @@ static struct config_item *config_keys_make_item(struct config_group *group,
static ssize_t config_keys_count_show(struct config_item *item, char *page)
{
- return sprintf(page, "%d\n", key_count);
+ return sysfs_emit(page, "%d\n", key_count);
}
CONFIGFS_ATTR_RO(config_keys_, count);
@@ -274,7 +275,7 @@ static bool is_dm_key_reused;
static ssize_t config_keys_reuse_show(struct config_item *item, char *page)
{
- return sprintf(page, "%d\n", is_dm_key_reused);
+ return sysfs_emit(page, "%d\n", is_dm_key_reused);
}
static ssize_t config_keys_reuse_store(struct config_item *item,
@@ -321,7 +322,7 @@ static bool restore;
static ssize_t config_keys_restore_show(struct config_item *item, char *page)
{
- return sprintf(page, "%d\n", restore);
+ return sysfs_emit(page, "%d\n", restore);
}
static ssize_t config_keys_restore_store(struct config_item *item,
@@ -387,7 +388,7 @@ static int build_keys_header(void)
strscpy(keys_header->keys[i].key_desc, key->description,
KEY_DESC_MAX_LEN);
- r = read_key_from_user_keying(&keys_header->keys[i]);
+ r = read_key_from_user_keyring(&keys_header->keys[i]);
if (r != 0) {
kexec_dprintk("Failed to read key %s\n",
keys_header->keys[i].key_desc);
@@ -414,14 +415,16 @@ int crash_load_dm_crypt_keys(struct kimage *image)
if (key_count <= 0) {
kexec_dprintk("No dm-crypt keys\n");
- return -ENOENT;
+ return 0;
}
if (!is_dm_key_reused) {
image->dm_crypt_keys_addr = 0;
r = build_keys_header();
- if (r)
+ if (r) {
+ pr_err("Failed to build dm-crypt keys header, ret=%d\n", r);
return r;
+ }
}
kbuf.buffer = keys_header;
@@ -432,6 +435,7 @@ int crash_load_dm_crypt_keys(struct kimage *image)
kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
r = kexec_add_buffer(&kbuf);
if (r) {
+ pr_err("Failed to call kexec_add_buffer, ret=%d\n", r);
kvfree((void *)kbuf.buffer);
return r;
}
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
index 62e60e0223cf..eee37a11380c 100644
--- a/kernel/crash_reserve.c
+++ b/kernel/crash_reserve.c
@@ -20,8 +20,6 @@
#include <asm/page.h>
#include <asm/sections.h>
-#include <crypto/sha1.h>
-
#include "kallsyms_internal.h"
#include "kexec_internal.h"
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 159900736f25..bfef21b4a9ae 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -72,6 +72,9 @@ config ARCH_HAS_DMA_PREP_COHERENT
config ARCH_HAS_FORCE_DMA_UNENCRYPTED
bool
+config ARCH_HAS_BATCHED_DMA_SYNC
+ bool
+
#
# Select this option if the architecture assumes DMA devices are coherent
# by default.
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 1147497bc512..bcdc0f76d2e8 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -362,17 +362,11 @@ static void rmem_dma_device_release(struct reserved_mem *rmem,
dev->dma_mem = NULL;
}
-static const struct reserved_mem_ops rmem_dma_ops = {
- .device_init = rmem_dma_device_init,
- .device_release = rmem_dma_device_release,
-};
-static int __init rmem_dma_setup(struct reserved_mem *rmem)
+static int __init rmem_dma_setup(unsigned long node, struct reserved_mem *rmem)
{
- unsigned long node = rmem->fdt_node;
-
if (of_get_flat_dt_prop(node, "reusable", NULL))
- return -EINVAL;
+ return -ENODEV;
#ifdef CONFIG_ARM
if (!of_get_flat_dt_prop(node, "no-map", NULL)) {
@@ -390,7 +384,6 @@ static int __init rmem_dma_setup(struct reserved_mem *rmem)
}
#endif
- rmem->ops = &rmem_dma_ops;
pr_info("Reserved memory: created DMA memory pool at %pa, size %ld MiB\n",
&rmem->base, (unsigned long)rmem->size / SZ_1M);
return 0;
@@ -407,5 +400,11 @@ static int __init dma_init_reserved_memory(void)
core_initcall(dma_init_reserved_memory);
#endif /* CONFIG_DMA_GLOBAL_POOL */
-RESERVEDMEM_OF_DECLARE(dma, "shared-dma-pool", rmem_dma_setup);
+static const struct reserved_mem_ops rmem_dma_ops = {
+ .node_init = rmem_dma_setup,
+ .device_init = rmem_dma_device_init,
+ .device_release = rmem_dma_device_release,
+};
+
+RESERVEDMEM_OF_DECLARE(dma, "shared-dma-pool", &rmem_dma_ops);
#endif
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index c56004d314dc..03f52bd17120 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -42,7 +42,6 @@
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/sizes.h>
-#include <linux/dma-buf/heaps/cma.h>
#include <linux/dma-map-ops.h>
#include <linux/cma.h>
#include <linux/nospec.h>
@@ -53,7 +52,38 @@
#define CMA_SIZE_MBYTES 0
#endif
-struct cma *dma_contiguous_default_area;
+static struct cma *dma_contiguous_areas[MAX_CMA_AREAS];
+static unsigned int dma_contiguous_areas_num;
+
+static int dma_contiguous_insert_area(struct cma *cma)
+{
+ if (dma_contiguous_areas_num >= ARRAY_SIZE(dma_contiguous_areas))
+ return -EINVAL;
+
+ dma_contiguous_areas[dma_contiguous_areas_num++] = cma;
+
+ return 0;
+}
+
+/**
+ * dma_contiguous_get_area_by_idx() - Get contiguous area at given index
+ * @idx: index of the area we query
+ *
+ * Queries for the contiguous area located at index @idx.
+ *
+ * Returns:
+ * A pointer to the requested contiguous area, or NULL otherwise.
+ */
+struct cma *dma_contiguous_get_area_by_idx(unsigned int idx)
+{
+ if (idx >= dma_contiguous_areas_num)
+ return NULL;
+
+ return dma_contiguous_areas[idx];
+}
+EXPORT_SYMBOL_GPL(dma_contiguous_get_area_by_idx);
+
+static struct cma *dma_contiguous_default_area;
/*
* Default global CMA area size can be defined in kernel's .config.
@@ -91,15 +121,14 @@ static int __init early_cma(char *p)
}
early_param("cma", early_cma);
-/*
- * cma_skip_dt_default_reserved_mem - This is called from the
- * reserved_mem framework to detect if the default cma region is being
- * set by the "cma=" kernel parameter.
- */
-bool __init cma_skip_dt_default_reserved_mem(void)
+struct cma *dev_get_cma_area(struct device *dev)
{
- return size_cmdline != -1;
+ if (dev && dev->cma_area)
+ return dev->cma_area;
+
+ return dma_contiguous_default_area;
}
+EXPORT_SYMBOL_GPL(dev_get_cma_area);
#ifdef CONFIG_DMA_NUMA_CMA
@@ -264,9 +293,24 @@ void __init dma_contiguous_reserve(phys_addr_t limit)
if (ret)
return;
- ret = dma_heap_cma_register_heap(dma_contiguous_default_area);
+ /*
+ * We need to insert the new area in our list to avoid
+ * any inconsistencies between having the default area
+ * listed in the DT or not.
+ *
+ * The DT case is handled by rmem_cma_setup() and will
+ * always insert all its areas in our list. However, if
+ * it didn't run (because OF_RESERVED_MEM isn't set, or
+ * there's no DT region specified), then we don't have a
+ * default area yet, and no area in our list.
+ *
+ * This block creates the default area in such a case,
+ * but we also need to insert it in our list to avoid
+ * having a default area but an empty list.
+ */
+ ret = dma_contiguous_insert_area(dma_contiguous_default_area);
if (ret)
- pr_warn("Couldn't register default CMA heap.");
+ pr_warn("Couldn't queue default CMA region for heap creation.\n");
}
}
@@ -470,47 +514,89 @@ static void rmem_cma_device_release(struct reserved_mem *rmem,
dev->cma_area = NULL;
}
-static const struct reserved_mem_ops rmem_cma_ops = {
- .device_init = rmem_cma_device_init,
- .device_release = rmem_cma_device_release,
-};
+static int __init __rmem_cma_verify_node(unsigned long node)
+{
+ if (!of_get_flat_dt_prop(node, "reusable", NULL) ||
+ of_get_flat_dt_prop(node, "no-map", NULL))
+ return -ENODEV;
+
+ if (size_cmdline != -1 &&
+ of_get_flat_dt_prop(node, "linux,cma-default", NULL)) {
+ pr_err("Skipping dt linux,cma-default node in favor of \"cma=\" kernel param.\n");
+ return -EBUSY;
+ }
+ return 0;
+}
-static int __init rmem_cma_setup(struct reserved_mem *rmem)
+static int __init rmem_cma_validate(unsigned long node, phys_addr_t *align)
+{
+ int ret = __rmem_cma_verify_node(node);
+
+ if (ret)
+ return ret;
+
+ if (align)
+ *align = max_t(phys_addr_t, *align, CMA_MIN_ALIGNMENT_BYTES);
+
+ return 0;
+}
+
+static int __init rmem_cma_fixup(unsigned long node, phys_addr_t base,
+ phys_addr_t size)
+{
+ int ret = __rmem_cma_verify_node(node);
+
+ if (ret)
+ return ret;
+
+ /* Architecture specific contiguous memory fixup. */
+ dma_contiguous_early_fixup(base, size);
+ return 0;
+}
+
+static int __init rmem_cma_setup(unsigned long node, struct reserved_mem *rmem)
{
- unsigned long node = rmem->fdt_node;
bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL);
struct cma *cma;
- int err;
+ int ret;
- if (!of_get_flat_dt_prop(node, "reusable", NULL) ||
- of_get_flat_dt_prop(node, "no-map", NULL))
- return -EINVAL;
+ ret = __rmem_cma_verify_node(node);
+ if (ret)
+ return ret;
if (!IS_ALIGNED(rmem->base | rmem->size, CMA_MIN_ALIGNMENT_BYTES)) {
pr_err("Reserved memory: incorrect alignment of CMA region\n");
return -EINVAL;
}
- err = cma_init_reserved_mem(rmem->base, rmem->size, 0, rmem->name, &cma);
- if (err) {
+ ret = cma_init_reserved_mem(rmem->base, rmem->size, 0, rmem->name, &cma);
+ if (ret) {
pr_err("Reserved memory: unable to setup CMA region\n");
- return err;
+ return ret;
}
if (default_cma)
dma_contiguous_default_area = cma;
- rmem->ops = &rmem_cma_ops;
rmem->priv = cma;
pr_info("Reserved memory: created CMA memory pool at %pa, size %ld MiB\n",
&rmem->base, (unsigned long)rmem->size / SZ_1M);
- err = dma_heap_cma_register_heap(cma);
- if (err)
- pr_warn("Couldn't register CMA heap.");
+ ret = dma_contiguous_insert_area(cma);
+ if (ret)
+ pr_warn("Couldn't store CMA reserved area.\n");
return 0;
}
-RESERVEDMEM_OF_DECLARE(cma, "shared-dma-pool", rmem_cma_setup);
+
+static const struct reserved_mem_ops rmem_cma_ops = {
+ .node_validate = rmem_cma_validate,
+ .node_fixup = rmem_cma_fixup,
+ .node_init = rmem_cma_setup,
+ .device_init = rmem_cma_device_init,
+ .device_release = rmem_cma_device_release,
+};
+
+RESERVEDMEM_OF_DECLARE(cma, "shared-dma-pool", &rmem_cma_ops);
#endif
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 86f87e43438c..1a725edbbbf6 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -453,7 +453,7 @@ static int active_cacheline_set_overlap(phys_addr_t cln, int overlap)
return overlap;
}
-static void active_cacheline_inc_overlap(phys_addr_t cln)
+static void active_cacheline_inc_overlap(phys_addr_t cln, bool is_cache_clean)
{
int overlap = active_cacheline_read_overlap(cln);
@@ -462,7 +462,7 @@ static void active_cacheline_inc_overlap(phys_addr_t cln)
/* If we overflowed the overlap counter then we're potentially
* leaking dma-mappings.
*/
- WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP,
+ WARN_ONCE(!is_cache_clean && overlap > ACTIVE_CACHELINE_MAX_OVERLAP,
pr_fmt("exceeded %d overlapping mappings of cacheline %pa\n"),
ACTIVE_CACHELINE_MAX_OVERLAP, &cln);
}
@@ -495,7 +495,7 @@ static int active_cacheline_insert(struct dma_debug_entry *entry,
if (rc == -EEXIST) {
struct dma_debug_entry *existing;
- active_cacheline_inc_overlap(cln);
+ active_cacheline_inc_overlap(cln, entry->is_cache_clean);
existing = radix_tree_lookup(&dma_active_cacheline, cln);
/* A lookup failure here after we got -EEXIST is unexpected. */
WARN_ON(!existing);
@@ -601,7 +601,8 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs)
unsigned long flags;
int rc;
- entry->is_cache_clean = !!(attrs & DMA_ATTR_CPU_CACHE_CLEAN);
+ entry->is_cache_clean = attrs & (DMA_ATTR_DEBUGGING_IGNORE_CACHELINES |
+ DMA_ATTR_REQUIRE_COHERENT);
bucket = get_hash_bucket(entry, &flags);
hash_bucket_add(bucket, entry);
@@ -614,6 +615,7 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs)
} else if (rc == -EEXIST &&
!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
!(entry->is_cache_clean && overlap_cache_clean) &&
+ dma_get_cache_alignment() >= L1_CACHE_BYTES &&
!(IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) &&
is_swiotlb_active(entry->dev))) {
err_printk(entry->dev, entry,
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 8f43a930716d..ec887f443741 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -406,6 +406,8 @@ void dma_direct_sync_sg_for_device(struct device *dev,
arch_sync_dma_for_device(paddr, sg->length,
dir);
}
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_flush();
}
#endif
@@ -427,8 +429,10 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir);
}
- if (!dev_is_dma_coherent(dev))
+ if (!dev_is_dma_coherent(dev)) {
+ arch_sync_dma_flush();
arch_sync_dma_for_cpu_all();
+ }
}
/*
@@ -440,14 +444,19 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
{
struct scatterlist *sg;
int i;
+ bool need_sync = false;
for_each_sg(sgl, sg, nents, i) {
- if (sg_dma_is_bus_address(sg))
+ if (sg_dma_is_bus_address(sg)) {
sg_dma_unmark_bus_address(sg);
- else
+ } else {
+ need_sync = true;
dma_direct_unmap_phys(dev, sg->dma_address,
- sg_dma_len(sg), dir, attrs);
+ sg_dma_len(sg), dir, attrs, false);
+ }
}
+ if (need_sync && !dev_is_dma_coherent(dev))
+ arch_sync_dma_flush();
}
#endif
@@ -457,6 +466,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
struct pci_p2pdma_map_state p2pdma_state = {};
struct scatterlist *sg;
int i, ret;
+ bool need_sync = false;
for_each_sg(sgl, sg, nents, i) {
switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
@@ -468,8 +478,9 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
*/
break;
case PCI_P2PDMA_MAP_NONE:
+ need_sync = true;
sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg),
- sg->length, dir, attrs);
+ sg->length, dir, attrs, false);
if (sg->dma_address == DMA_MAPPING_ERROR) {
ret = -EIO;
goto out_unmap;
@@ -488,6 +499,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
sg_dma_len(sg) = sg->length;
}
+ if (need_sync && !dev_is_dma_coherent(dev))
+ arch_sync_dma_flush();
return nents;
out_unmap:
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index f476c63b668c..7140c208c123 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -60,17 +60,22 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
swiotlb_sync_single_for_device(dev, paddr, size, dir);
- if (!dev_is_dma_coherent(dev))
+ if (!dev_is_dma_coherent(dev)) {
arch_sync_dma_for_device(paddr, size, dir);
+ arch_sync_dma_flush();
+ }
}
static inline void dma_direct_sync_single_for_cpu(struct device *dev,
- dma_addr_t addr, size_t size, enum dma_data_direction dir)
+ dma_addr_t addr, size_t size, enum dma_data_direction dir,
+ bool flush)
{
phys_addr_t paddr = dma_to_phys(dev, addr);
if (!dev_is_dma_coherent(dev)) {
arch_sync_dma_for_cpu(paddr, size, dir);
+ if (flush)
+ arch_sync_dma_flush();
arch_sync_dma_for_cpu_all();
}
@@ -79,26 +84,35 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
static inline dma_addr_t dma_direct_map_phys(struct device *dev,
phys_addr_t phys, size_t size, enum dma_data_direction dir,
- unsigned long attrs)
+ unsigned long attrs, bool flush)
{
dma_addr_t dma_addr;
if (is_swiotlb_force_bounce(dev)) {
- if (attrs & DMA_ATTR_MMIO)
- goto err_overflow;
+ if (!(attrs & DMA_ATTR_CC_SHARED)) {
+ if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT))
+ return DMA_MAPPING_ERROR;
- return swiotlb_map(dev, phys, size, dir, attrs);
+ return swiotlb_map(dev, phys, size, dir, attrs);
+ }
+ } else if (attrs & DMA_ATTR_CC_SHARED) {
+ return DMA_MAPPING_ERROR;
}
if (attrs & DMA_ATTR_MMIO) {
dma_addr = phys;
if (unlikely(!dma_capable(dev, dma_addr, size, false)))
goto err_overflow;
+ } else if (attrs & DMA_ATTR_CC_SHARED) {
+ dma_addr = phys_to_dma_unencrypted(dev, phys);
+ if (unlikely(!dma_capable(dev, dma_addr, size, false)))
+ goto err_overflow;
} else {
dma_addr = phys_to_dma(dev, phys);
if (unlikely(!dma_capable(dev, dma_addr, size, true)) ||
dma_kmalloc_needs_bounce(dev, size, dir)) {
- if (is_swiotlb_active(dev))
+ if (is_swiotlb_active(dev) &&
+ !(attrs & DMA_ATTR_REQUIRE_COHERENT))
return swiotlb_map(dev, phys, size, dir, attrs);
goto err_overflow;
@@ -106,8 +120,11 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
}
if (!dev_is_dma_coherent(dev) &&
- !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
+ !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) {
arch_sync_dma_for_device(phys, size, dir);
+ if (flush)
+ arch_sync_dma_flush();
+ }
return dma_addr;
err_overflow:
@@ -119,17 +136,18 @@ err_overflow:
}
static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
- size_t size, enum dma_data_direction dir, unsigned long attrs)
+ size_t size, enum dma_data_direction dir, unsigned long attrs,
+ bool flush)
{
phys_addr_t phys;
- if (attrs & DMA_ATTR_MMIO)
+ if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT))
/* nothing to do: uncached and no swiotlb */
return;
phys = dma_to_phys(dev, addr);
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+ dma_direct_sync_single_for_cpu(dev, addr, size, dir, flush);
swiotlb_tbl_unmap_single(dev, phys, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC);
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
index 0f33b3ea7daf..29eeb5fdf199 100644
--- a/kernel/dma/map_benchmark.c
+++ b/kernel/dma/map_benchmark.c
@@ -5,6 +5,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/cleanup.h>
#include <linux/debugfs.h>
#include <linux/delay.h>
#include <linux/device.h>
@@ -15,6 +16,7 @@
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/platform_device.h>
+#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <linux/timekeeping.h>
#include <uapi/linux/map_benchmark.h>
@@ -31,17 +33,219 @@ struct map_benchmark_data {
atomic64_t loops;
};
+struct map_benchmark_ops {
+ void *(*prepare)(struct map_benchmark_data *map);
+ void (*unprepare)(void *mparam);
+ void (*initialize_data)(void *mparam);
+ int (*do_map)(void *mparam);
+ void (*do_unmap)(void *mparam);
+};
+
+struct dma_single_map_param {
+ struct device *dev;
+ dma_addr_t addr;
+ void *xbuf;
+ u32 npages;
+ u32 dma_dir;
+};
+
+static void *dma_single_map_benchmark_prepare(struct map_benchmark_data *map)
+{
+ struct dma_single_map_param *params __free(kfree) = kzalloc(sizeof(*params),
+ GFP_KERNEL);
+ if (!params)
+ return NULL;
+
+ params->npages = map->bparam.granule;
+ params->dma_dir = map->bparam.dma_dir;
+ params->dev = map->dev;
+ params->xbuf = alloc_pages_exact(params->npages * PAGE_SIZE, GFP_KERNEL);
+ if (!params->xbuf)
+ return NULL;
+
+ return_ptr(params);
+}
+
+static void dma_single_map_benchmark_unprepare(void *mparam)
+{
+ struct dma_single_map_param *params = mparam;
+
+ free_pages_exact(params->xbuf, params->npages * PAGE_SIZE);
+ kfree(params);
+}
+
+static void dma_single_map_benchmark_initialize_data(void *mparam)
+{
+ struct dma_single_map_param *params = mparam;
+
+ /*
+ * for a non-coherent device, if we don't stain them in the
+ * cache, this will give an underestimate of the real-world
+ * overhead of BIDIRECTIONAL or TO_DEVICE mappings;
+ * 66 means everything goes well! 66 is lucky.
+ */
+ if (params->dma_dir != DMA_FROM_DEVICE)
+ memset(params->xbuf, 0x66, params->npages * PAGE_SIZE);
+}
+
+static int dma_single_map_benchmark_do_map(void *mparam)
+{
+ struct dma_single_map_param *params = mparam;
+
+ params->addr = dma_map_single(params->dev, params->xbuf,
+ params->npages * PAGE_SIZE, params->dma_dir);
+ if (unlikely(dma_mapping_error(params->dev, params->addr))) {
+ pr_err("dma_map_single failed on %s\n", dev_name(params->dev));
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void dma_single_map_benchmark_do_unmap(void *mparam)
+{
+ struct dma_single_map_param *params = mparam;
+
+ dma_unmap_single(params->dev, params->addr,
+ params->npages * PAGE_SIZE, params->dma_dir);
+}
+
+static struct map_benchmark_ops dma_single_map_benchmark_ops = {
+ .prepare = dma_single_map_benchmark_prepare,
+ .unprepare = dma_single_map_benchmark_unprepare,
+ .initialize_data = dma_single_map_benchmark_initialize_data,
+ .do_map = dma_single_map_benchmark_do_map,
+ .do_unmap = dma_single_map_benchmark_do_unmap,
+};
+
+struct dma_sg_map_param {
+ struct sg_table sgt;
+ struct device *dev;
+ void **buf;
+ u32 npages;
+ u32 dma_dir;
+};
+
+static void *dma_sg_map_benchmark_prepare(struct map_benchmark_data *map)
+{
+ struct scatterlist *sg;
+ int i;
+
+ struct dma_sg_map_param *params = kzalloc(sizeof(*params), GFP_KERNEL);
+
+ if (!params)
+ return NULL;
+ /*
+ * Set the number of scatterlist entries based on the granule.
+ * In SG mode, 'granule' represents the number of scatterlist entries.
+ * Each scatterlist entry corresponds to a single page.
+ */
+ params->npages = map->bparam.granule;
+ params->dma_dir = map->bparam.dma_dir;
+ params->dev = map->dev;
+ params->buf = kmalloc_array(params->npages, sizeof(*params->buf),
+ GFP_KERNEL);
+ if (!params->buf)
+ goto out;
+
+ if (sg_alloc_table(&params->sgt, params->npages, GFP_KERNEL))
+ goto free_buf;
+
+ for_each_sgtable_sg(&params->sgt, sg, i) {
+ params->buf[i] = (void *)__get_free_page(GFP_KERNEL);
+ if (!params->buf[i])
+ goto free_page;
+
+ sg_set_buf(sg, params->buf[i], PAGE_SIZE);
+ }
+
+ return params;
+
+free_page:
+ while (i-- > 0)
+ free_page((unsigned long)params->buf[i]);
+
+ sg_free_table(&params->sgt);
+free_buf:
+ kfree(params->buf);
+out:
+ kfree(params);
+ return NULL;
+}
+
+static void dma_sg_map_benchmark_unprepare(void *mparam)
+{
+ struct dma_sg_map_param *params = mparam;
+ int i;
+
+ for (i = 0; i < params->npages; i++)
+ free_page((unsigned long)params->buf[i]);
+
+ sg_free_table(&params->sgt);
+
+ kfree(params->buf);
+ kfree(params);
+}
+
+static void dma_sg_map_benchmark_initialize_data(void *mparam)
+{
+ struct dma_sg_map_param *params = mparam;
+ struct scatterlist *sg;
+ int i = 0;
+
+ if (params->dma_dir == DMA_FROM_DEVICE)
+ return;
+
+ for_each_sgtable_sg(&params->sgt, sg, i)
+ memset(params->buf[i], 0x66, PAGE_SIZE);
+}
+
+static int dma_sg_map_benchmark_do_map(void *mparam)
+{
+ struct dma_sg_map_param *params = mparam;
+ int ret = 0;
+
+ int sg_mapped = dma_map_sg(params->dev, params->sgt.sgl,
+ params->npages, params->dma_dir);
+ if (!sg_mapped) {
+ pr_err("dma_map_sg failed on %s\n", dev_name(params->dev));
+ ret = -ENOMEM;
+ }
+
+ return ret;
+}
+
+static void dma_sg_map_benchmark_do_unmap(void *mparam)
+{
+ struct dma_sg_map_param *params = mparam;
+
+ dma_unmap_sg(params->dev, params->sgt.sgl, params->npages,
+ params->dma_dir);
+}
+
+static struct map_benchmark_ops dma_sg_map_benchmark_ops = {
+ .prepare = dma_sg_map_benchmark_prepare,
+ .unprepare = dma_sg_map_benchmark_unprepare,
+ .initialize_data = dma_sg_map_benchmark_initialize_data,
+ .do_map = dma_sg_map_benchmark_do_map,
+ .do_unmap = dma_sg_map_benchmark_do_unmap,
+};
+
+static struct map_benchmark_ops *dma_map_benchmark_ops[DMA_MAP_BENCH_MODE_MAX] = {
+ [DMA_MAP_BENCH_SINGLE_MODE] = &dma_single_map_benchmark_ops,
+ [DMA_MAP_BENCH_SG_MODE] = &dma_sg_map_benchmark_ops,
+};
+
static int map_benchmark_thread(void *data)
{
- void *buf;
- dma_addr_t dma_addr;
struct map_benchmark_data *map = data;
- int npages = map->bparam.granule;
- u64 size = npages * PAGE_SIZE;
+ __u8 map_mode = map->bparam.map_mode;
int ret = 0;
- buf = alloc_pages_exact(size, GFP_KERNEL);
- if (!buf)
+ struct map_benchmark_ops *mb_ops = dma_map_benchmark_ops[map_mode];
+ void *mparam = mb_ops->prepare(map);
+
+ if (!mparam)
return -ENOMEM;
while (!kthread_should_stop()) {
@@ -49,23 +253,12 @@ static int map_benchmark_thread(void *data)
ktime_t map_stime, map_etime, unmap_stime, unmap_etime;
ktime_t map_delta, unmap_delta;
- /*
- * for a non-coherent device, if we don't stain them in the
- * cache, this will give an underestimate of the real-world
- * overhead of BIDIRECTIONAL or TO_DEVICE mappings;
- * 66 means evertything goes well! 66 is lucky.
- */
- if (map->dir != DMA_FROM_DEVICE)
- memset(buf, 0x66, size);
-
+ mb_ops->initialize_data(mparam);
map_stime = ktime_get();
- dma_addr = dma_map_single(map->dev, buf, size, map->dir);
- if (unlikely(dma_mapping_error(map->dev, dma_addr))) {
- pr_err("dma_map_single failed on %s\n",
- dev_name(map->dev));
- ret = -ENOMEM;
+ ret = mb_ops->do_map(mparam);
+ if (ret)
goto out;
- }
+
map_etime = ktime_get();
map_delta = ktime_sub(map_etime, map_stime);
@@ -73,7 +266,8 @@ static int map_benchmark_thread(void *data)
ndelay(map->bparam.dma_trans_ns);
unmap_stime = ktime_get();
- dma_unmap_single(map->dev, dma_addr, size, map->dir);
+ mb_ops->do_unmap(mparam);
+
unmap_etime = ktime_get();
unmap_delta = ktime_sub(unmap_etime, unmap_stime);
@@ -108,7 +302,7 @@ static int map_benchmark_thread(void *data)
}
out:
- free_pages_exact(buf, size);
+ mb_ops->unprepare(mparam);
return ret;
}
@@ -209,6 +403,12 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
switch (cmd) {
case DMA_MAP_BENCHMARK:
+ if (map->bparam.map_mode < 0 ||
+ map->bparam.map_mode >= DMA_MAP_BENCH_MODE_MAX) {
+ pr_err("invalid map mode\n");
+ return -EINVAL;
+ }
+
if (map->bparam.threads == 0 ||
map->bparam.threads > DMA_MAP_MAX_THREADS) {
pr_err("invalid thread number\n");
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 3928a509c44c..23ed8eb9233e 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -157,6 +157,7 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
{
const struct dma_map_ops *ops = get_dma_ops(dev);
bool is_mmio = attrs & DMA_ATTR_MMIO;
+ bool is_cc_shared = attrs & DMA_ATTR_CC_SHARED;
dma_addr_t addr = DMA_MAPPING_ERROR;
BUG_ON(!valid_dma_direction(dir));
@@ -164,9 +165,15 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
if (WARN_ON_ONCE(!dev->dma_mask))
return DMA_MAPPING_ERROR;
+ if (!dev_is_dma_coherent(dev) && (attrs & DMA_ATTR_REQUIRE_COHERENT))
+ return DMA_MAPPING_ERROR;
+
if (dma_map_direct(dev, ops) ||
- (!is_mmio && arch_dma_map_phys_direct(dev, phys + size)))
- addr = dma_direct_map_phys(dev, phys, size, dir, attrs);
+ (!is_mmio && !is_cc_shared &&
+ arch_dma_map_phys_direct(dev, phys + size)))
+ addr = dma_direct_map_phys(dev, phys, size, dir, attrs, true);
+ else if (is_cc_shared)
+ return DMA_MAPPING_ERROR;
else if (use_dma_iommu(dev))
addr = iommu_dma_map_phys(dev, phys, size, dir, attrs);
else if (ops->map_phys)
@@ -203,11 +210,16 @@ void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size,
{
const struct dma_map_ops *ops = get_dma_ops(dev);
bool is_mmio = attrs & DMA_ATTR_MMIO;
+ bool is_cc_shared = attrs & DMA_ATTR_CC_SHARED;
BUG_ON(!valid_dma_direction(dir));
+
if (dma_map_direct(dev, ops) ||
- (!is_mmio && arch_dma_unmap_phys_direct(dev, addr + size)))
- dma_direct_unmap_phys(dev, addr, size, dir, attrs);
+ (!is_mmio && !is_cc_shared &&
+ arch_dma_unmap_phys_direct(dev, addr + size)))
+ dma_direct_unmap_phys(dev, addr, size, dir, attrs, true);
+ else if (is_cc_shared)
+ return;
else if (use_dma_iommu(dev))
iommu_dma_unmap_phys(dev, addr, size, dir, attrs);
else if (ops->unmap_phys)
@@ -235,6 +247,9 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
BUG_ON(!valid_dma_direction(dir));
+ if (!dev_is_dma_coherent(dev) && (attrs & DMA_ATTR_REQUIRE_COHERENT))
+ return -EOPNOTSUPP;
+
if (WARN_ON_ONCE(!dev->dma_mask))
return 0;
@@ -373,7 +388,7 @@ void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
BUG_ON(!valid_dma_direction(dir));
if (dma_map_direct(dev, ops))
- dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+ dma_direct_sync_single_for_cpu(dev, addr, size, dir, true);
else if (use_dma_iommu(dev))
iommu_dma_sync_single_for_cpu(dev, addr, size, dir);
else if (ops->sync_single_for_cpu)
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index d8e6f1d889d5..9a15e7231e39 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -30,6 +30,7 @@
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/io.h>
+#include <linux/kmsan-checks.h>
#include <linux/iommu-helper.h>
#include <linux/init.h>
#include <linux/memblock.h>
@@ -867,6 +868,9 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
if (orig_addr == INVALID_PHYS_ADDR)
return;
+ if (dir == DMA_FROM_DEVICE && !dev_is_dma_coherent(dev))
+ arch_sync_dma_flush();
+
/*
* It's valid for tlb_offset to be negative. This can happen when the
* "offset" returned by swiotlb_align_offset() is non-zero, and the
@@ -901,10 +905,19 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
local_irq_save(flags);
page = pfn_to_page(pfn);
- if (dir == DMA_TO_DEVICE)
+ if (dir == DMA_TO_DEVICE) {
+ /*
+ * Ideally, kmsan_check_highmem_page()
+ * could be used here to detect infoleaks,
+ * but callers may map uninitialized buffers
+ * that will be written by the device,
+ * causing false positives.
+ */
memcpy_from_page(vaddr, page, offset, sz);
- else
+ } else {
+ kmsan_unpoison_memory(vaddr, sz);
memcpy_to_page(page, offset, vaddr, sz);
+ }
local_irq_restore(flags);
size -= sz;
@@ -913,8 +926,15 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
offset = 0;
}
} else if (dir == DMA_TO_DEVICE) {
+ /*
+ * Ideally, kmsan_check_memory() could be used here to detect
+ * infoleaks (uninitialized data being sent to device), but
+ * callers may map uninitialized buffers that will be written
+ * by the device, causing false positives.
+ */
memcpy(vaddr, phys_to_virt(orig_addr), size);
} else {
+ kmsan_unpoison_memory(vaddr, size);
memcpy(phys_to_virt(orig_addr), vaddr, size);
}
}
@@ -1595,8 +1615,10 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
return DMA_MAPPING_ERROR;
}
- if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
arch_sync_dma_for_device(swiotlb_addr, size, dir);
+ arch_sync_dma_flush();
+ }
return dma_addr;
}
@@ -1855,26 +1877,25 @@ static void rmem_swiotlb_device_release(struct reserved_mem *rmem,
dev->dma_io_tlb_mem = &io_tlb_default_mem;
}
-static const struct reserved_mem_ops rmem_swiotlb_ops = {
- .device_init = rmem_swiotlb_device_init,
- .device_release = rmem_swiotlb_device_release,
-};
-
-static int __init rmem_swiotlb_setup(struct reserved_mem *rmem)
+static int __init rmem_swiotlb_setup(unsigned long node,
+ struct reserved_mem *rmem)
{
- unsigned long node = rmem->fdt_node;
-
if (of_get_flat_dt_prop(node, "reusable", NULL) ||
of_get_flat_dt_prop(node, "linux,cma-default", NULL) ||
of_get_flat_dt_prop(node, "linux,dma-default", NULL) ||
of_get_flat_dt_prop(node, "no-map", NULL))
return -EINVAL;
- rmem->ops = &rmem_swiotlb_ops;
pr_info("Reserved memory: created restricted DMA pool at %pa, size %ld MiB\n",
&rmem->base, (unsigned long)rmem->size / SZ_1M);
return 0;
}
-RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", rmem_swiotlb_setup);
+static const struct reserved_mem_ops rmem_swiotlb_ops = {
+ .node_init = rmem_swiotlb_setup,
+ .device_init = rmem_swiotlb_device_init,
+ .device_release = rmem_swiotlb_device_release,
+};
+
+RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", &rmem_swiotlb_ops);
#endif /* CONFIG_DMA_RESTRICTED_POOL */
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 9ef63e414791..19d2244a9fef 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -47,10 +47,10 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re
*/
while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) {
- local_irq_enable_exit_to_user(ti_work);
+ local_irq_enable();
if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
- if (!rseq_grant_slice_extension(ti_work & TIF_SLICE_EXT_DENY))
+ if (!rseq_grant_slice_extension(ti_work, TIF_SLICE_EXT_DENY))
schedule();
}
@@ -74,7 +74,7 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re
* might have changed while interrupts and preemption was
* enabled above.
*/
- local_irq_disable_exit_to_user();
+ local_irq_disable();
/* Check if any of the above work has queued a deferred wakeup */
tick_nohz_user_enter_prepare();
@@ -105,70 +105,16 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
- irqentry_state_t ret = {
- .exit_rcu = false,
- };
-
if (user_mode(regs)) {
- irqentry_enter_from_user_mode(regs);
- return ret;
- }
+ irqentry_state_t ret = {
+ .exit_rcu = false,
+ };
- /*
- * If this entry hit the idle task invoke ct_irq_enter() whether
- * RCU is watching or not.
- *
- * Interrupts can nest when the first interrupt invokes softirq
- * processing on return which enables interrupts.
- *
- * Scheduler ticks in the idle task can mark quiescent state and
- * terminate a grace period, if and only if the timer interrupt is
- * not nested into another interrupt.
- *
- * Checking for rcu_is_watching() here would prevent the nesting
- * interrupt to invoke ct_irq_enter(). If that nested interrupt is
- * the tick then rcu_flavor_sched_clock_irq() would wrongfully
- * assume that it is the first interrupt and eventually claim
- * quiescent state and end grace periods prematurely.
- *
- * Unconditionally invoke ct_irq_enter() so RCU state stays
- * consistent.
- *
- * TINY_RCU does not support EQS, so let the compiler eliminate
- * this part when enabled.
- */
- if (!IS_ENABLED(CONFIG_TINY_RCU) &&
- (is_idle_task(current) || arch_in_rcu_eqs())) {
- /*
- * If RCU is not watching then the same careful
- * sequence vs. lockdep and tracing is required
- * as in irqentry_enter_from_user_mode().
- */
- lockdep_hardirqs_off(CALLER_ADDR0);
- ct_irq_enter();
- instrumentation_begin();
- kmsan_unpoison_entry_regs(regs);
- trace_hardirqs_off_finish();
- instrumentation_end();
-
- ret.exit_rcu = true;
+ irqentry_enter_from_user_mode(regs);
return ret;
}
- /*
- * If RCU is watching then RCU only wants to check whether it needs
- * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
- * already contains a warning when RCU is not watching, so no point
- * in having another one here.
- */
- lockdep_hardirqs_off(CALLER_ADDR0);
- instrumentation_begin();
- kmsan_unpoison_entry_regs(regs);
- rcu_irq_enter_check_tick();
- trace_hardirqs_off_finish();
- instrumentation_end();
-
- return ret;
+ return irqentry_enter_from_kernel_mode(regs);
}
/**
@@ -212,43 +158,10 @@ void dynamic_irqentry_exit_cond_resched(void)
noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
- lockdep_assert_irqs_disabled();
-
- /* Check whether this returns to user mode */
- if (user_mode(regs)) {
+ if (user_mode(regs))
irqentry_exit_to_user_mode(regs);
- } else if (!regs_irqs_disabled(regs)) {
- /*
- * If RCU was not watching on entry this needs to be done
- * carefully and needs the same ordering of lockdep/tracing
- * and RCU as the return to user mode path.
- */
- if (state.exit_rcu) {
- instrumentation_begin();
- /* Tell the tracer that IRET will enable interrupts */
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare();
- instrumentation_end();
- ct_irq_exit();
- lockdep_hardirqs_on(CALLER_ADDR0);
- return;
- }
-
- instrumentation_begin();
- if (IS_ENABLED(CONFIG_PREEMPTION))
- irqentry_exit_cond_resched();
-
- /* Covers both tracing and lockdep */
- trace_hardirqs_on();
- instrumentation_end();
- } else {
- /*
- * IRQ flags state is correct already. Just tell RCU if it
- * was not watching on entry.
- */
- if (state.exit_rcu)
- ct_irq_exit();
- }
+ else
+ irqentry_exit_to_kernel_mode(regs, state);
}
irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ac70d68217b6..6d1f8bad7e1c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4138,7 +4138,8 @@ static int merge_sched_in(struct perf_event *event, void *data)
if (*perf_event_fasync(event))
event->pending_kill = POLL_ERR;
- perf_event_wakeup(event);
+ event->pending_wakeup = 1;
+ irq_work_queue(&event->pending_irq);
} else {
struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu);
@@ -4812,7 +4813,7 @@ static void __perf_event_read(void *info)
struct perf_event *sub, *event = data->event;
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
- struct pmu *pmu = event->pmu;
+ struct pmu *pmu;
/*
* If this is a task context, we need to check whether it is
@@ -4824,7 +4825,7 @@ static void __perf_event_read(void *info)
if (ctx->task && cpuctx->task_ctx != ctx)
return;
- raw_spin_lock(&ctx->lock);
+ guard(raw_spinlock)(&ctx->lock);
ctx_time_update_event(ctx, event);
perf_event_update_time(event);
@@ -4832,25 +4833,22 @@ static void __perf_event_read(void *info)
perf_event_update_sibling_time(event);
if (event->state != PERF_EVENT_STATE_ACTIVE)
- goto unlock;
+ return;
if (!data->group) {
- pmu->read(event);
+ perf_pmu_read(event);
data->ret = 0;
- goto unlock;
+ return;
}
+ pmu = event->pmu_ctx->pmu;
pmu->start_txn(pmu, PERF_PMU_TXN_READ);
- pmu->read(event);
-
+ perf_pmu_read(event);
for_each_sibling_event(sub, event)
perf_pmu_read(sub);
data->ret = pmu->commit_txn(pmu);
-
-unlock:
- raw_spin_unlock(&ctx->lock);
}
static inline u64 perf_event_count(struct perf_event *event, bool self)
@@ -5370,15 +5368,15 @@ static void unaccount_freq_event(void)
static struct perf_ctx_data *
-alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global)
+alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global, gfp_t gfp_flags)
{
struct perf_ctx_data *cd;
- cd = kzalloc_obj(*cd);
+ cd = kzalloc_obj(*cd, gfp_flags);
if (!cd)
return NULL;
- cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL);
+ cd->data = kmem_cache_zalloc(ctx_cache, gfp_flags);
if (!cd->data) {
kfree(cd);
return NULL;
@@ -5412,11 +5410,11 @@ static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd)
static int
attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache,
- bool global)
+ bool global, gfp_t gfp_flags)
{
struct perf_ctx_data *cd, *old = NULL;
- cd = alloc_perf_ctx_data(ctx_cache, global);
+ cd = alloc_perf_ctx_data(ctx_cache, global, gfp_flags);
if (!cd)
return -ENOMEM;
@@ -5489,6 +5487,12 @@ again:
cd = NULL;
}
if (!cd) {
+ /*
+ * Try to allocate context quickly before
+ * traversing the whole thread list again.
+ */
+ if (!attach_task_ctx_data(p, ctx_cache, true, GFP_NOWAIT))
+ continue;
get_task_struct(p);
goto alloc;
}
@@ -5499,7 +5503,7 @@ again:
return 0;
alloc:
- ret = attach_task_ctx_data(p, ctx_cache, true);
+ ret = attach_task_ctx_data(p, ctx_cache, true, GFP_KERNEL);
put_task_struct(p);
if (ret) {
__detach_global_ctx_data();
@@ -5519,7 +5523,7 @@ attach_perf_ctx_data(struct perf_event *event)
return -ENOMEM;
if (task)
- return attach_task_ctx_data(task, ctx_cache, false);
+ return attach_task_ctx_data(task, ctx_cache, false, GFP_KERNEL);
ret = attach_global_ctx_data(ctx_cache);
if (ret)
@@ -5554,22 +5558,15 @@ static void __detach_global_ctx_data(void)
struct task_struct *g, *p;
struct perf_ctx_data *cd;
-again:
scoped_guard (rcu) {
for_each_process_thread(g, p) {
cd = rcu_dereference(p->perf_ctx_data);
- if (!cd || !cd->global)
- continue;
- cd->global = 0;
- get_task_struct(p);
- goto detach;
+ if (cd && cd->global) {
+ cd->global = 0;
+ detach_task_ctx_data(p);
+ }
}
}
- return;
-detach:
- detach_task_ctx_data(p);
- put_task_struct(p);
- goto again;
}
static void detach_global_ctx_data(void)
@@ -7215,7 +7212,7 @@ static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
#ifdef CONFIG_MMU
/* Clear any partial mappings on error. */
if (err)
- zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL);
+ zap_vma_range(vma, vma->vm_start, nr_pages * PAGE_SIZE);
#endif
return err;
@@ -7464,28 +7461,28 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
ret = perf_mmap_aux(vma, event, nr_pages);
if (ret)
return ret;
- }
- /*
- * Since pinned accounting is per vm we cannot allow fork() to copy our
- * vma.
- */
- vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
- vma->vm_ops = &perf_mmap_vmops;
+ /*
+ * Since pinned accounting is per vm we cannot allow fork() to copy our
+ * vma.
+ */
+ vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
+ vma->vm_ops = &perf_mmap_vmops;
- mapped = get_mapped(event, event_mapped);
- if (mapped)
- mapped(event, vma->vm_mm);
+ mapped = get_mapped(event, event_mapped);
+ if (mapped)
+ mapped(event, vma->vm_mm);
- /*
- * Try to map it into the page table. On fail, invoke
- * perf_mmap_close() to undo the above, as the callsite expects
- * full cleanup in this case and therefore does not invoke
- * vmops::close().
- */
- ret = map_range(event->rb, vma);
- if (ret)
- perf_mmap_close(vma);
+ /*
+ * Try to map it into the page table. On fail, invoke
+ * perf_mmap_close() to undo the above, as the callsite expects
+ * full cleanup in this case and therefore does not invoke
+ * vmops::close().
+ */
+ ret = map_range(event->rb, vma);
+ if (ret)
+ perf_mmap_close(vma);
+ }
return ret;
}
@@ -8422,7 +8419,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
pte_t *ptep, pte;
pgdp = pgd_offset(mm, addr);
- pgd = READ_ONCE(*pgdp);
+ pgd = pgdp_get(pgdp);
if (pgd_none(pgd))
return 0;
@@ -8430,7 +8427,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
return pgd_leaf_size(pgd);
p4dp = p4d_offset_lockless(pgdp, pgd, addr);
- p4d = READ_ONCE(*p4dp);
+ p4d = p4dp_get(p4dp);
if (!p4d_present(p4d))
return 0;
@@ -8438,7 +8435,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
return p4d_leaf_size(p4d);
pudp = pud_offset_lockless(p4dp, p4d, addr);
- pud = READ_ONCE(*pudp);
+ pud = pudp_get(pudp);
if (!pud_present(pud))
return 0;
@@ -9240,7 +9237,7 @@ perf_event_alloc_task_data(struct task_struct *child,
return;
attach:
- attach_task_ctx_data(child, ctx_cache, true);
+ attach_task_ctx_data(child, ctx_cache, true, GFP_KERNEL);
}
void perf_event_fork(struct task_struct *task)
@@ -10776,6 +10773,13 @@ int perf_event_overflow(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
+ /*
+ * Entry point from hardware PMI, interrupts should be disabled here.
+ * This serializes us against perf_event_remove_from_context() in
+ * things like perf_event_release_kernel().
+ */
+ lockdep_assert_irqs_disabled();
+
return __perf_event_overflow(event, 1, data, regs);
}
@@ -10852,6 +10856,19 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
{
struct hw_perf_event *hwc = &event->hw;
+ /*
+ * This is:
+ * - software preempt
+ * - tracepoint preempt
+ * - tp_target_task irq (ctx->lock)
+ * - uprobes preempt/irq
+ * - kprobes preempt/irq
+ * - hw_breakpoint irq
+ *
+ * Any of these are sufficient to hold off RCU and thus ensure @event
+ * exists.
+ */
+ lockdep_assert_preemption_disabled();
local64_add(nr, &event->count);
if (!regs)
@@ -10860,6 +10877,16 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
if (!is_sampling_event(event))
return;
+ /*
+ * Serialize against event_function_call() IPIs like normal overflow
+ * event handling. Specifically, must not allow
+ * perf_event_release_kernel() -> perf_remove_from_context() to make
+ * progress and 'release' the event from under us.
+ */
+ guard(irqsave)();
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
+
if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
data->period = nr;
return perf_swevent_overflow(event, 1, data, regs);
@@ -11358,6 +11385,11 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct perf_sample_data data;
struct perf_event *event;
+ /*
+ * Per being a tracepoint, this runs with preemption disabled.
+ */
+ lockdep_assert_preemption_disabled();
+
struct perf_raw_record raw = {
.frag = {
.size = entry_size,
@@ -11690,6 +11722,11 @@ void perf_bp_event(struct perf_event *bp, void *data)
struct perf_sample_data sample;
struct pt_regs *regs = data;
+ /*
+ * Exception context, will have interrupts disabled.
+ */
+ lockdep_assert_irqs_disabled();
+
perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
if (!bp->hw.state && !perf_exclude_event(bp, regs))
@@ -12154,7 +12191,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
if (regs && !perf_exclude_event(event, regs)) {
if (!(event->attr.exclude_idle && is_idle_task(current)))
- if (__perf_event_overflow(event, 1, &data, regs))
+ if (perf_event_overflow(event, &data, regs))
ret = HRTIMER_NORESTART;
}
@@ -14703,7 +14740,7 @@ inherit_event(struct perf_event *parent_event,
get_ctx(child_ctx);
child_event->ctx = child_ctx;
- pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
+ pmu_ctx = find_get_pmu_context(parent_event->pmu_ctx->pmu, child_ctx, child_event);
if (IS_ERR(pmu_ctx)) {
free_event(child_event);
return ERR_CAST(pmu_ctx);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 923b24b321cc..4084e926e284 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -344,7 +344,7 @@ out:
static void update_ref_ctr_warn(struct uprobe *uprobe,
struct mm_struct *mm, short d)
{
- pr_warn("ref_ctr %s failed for inode: 0x%lx offset: "
+ pr_warn("ref_ctr %s failed for inode: 0x%llx offset: "
"0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n",
d > 0 ? "increment" : "decrement", uprobe->inode->i_ino,
(unsigned long long) uprobe->offset,
@@ -982,7 +982,7 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
static void
ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe)
{
- pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx "
+ pr_warn("ref_ctr_offset mismatch. inode: 0x%llx offset: 0x%llx "
"ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n",
uprobe->inode->i_ino, (unsigned long long) uprobe->offset,
(unsigned long long) cur_uprobe->ref_ctr_offset,
diff --git a/kernel/exit.c b/kernel/exit.c
index 8a87021211ae..25e9cb6de7e7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -608,7 +608,8 @@ static struct task_struct *find_child_reaper(struct task_struct *father,
reaper = find_alive_thread(father);
if (reaper) {
- pid_ns->child_reaper = reaper;
+ ASSERT_EXCLUSIVE_WRITER(pid_ns->child_reaper);
+ WRITE_ONCE(pid_ns->child_reaper, reaper);
return reaper;
}
@@ -748,14 +749,12 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
tsk->exit_state = EXIT_ZOMBIE;
if (unlikely(tsk->ptrace)) {
- int sig = thread_group_leader(tsk) &&
- thread_group_empty(tsk) &&
- !ptrace_reparented(tsk) ?
- tsk->exit_signal : SIGCHLD;
+ int sig = thread_group_empty(tsk) && !ptrace_reparented(tsk)
+ ? tsk->exit_signal : SIGCHLD;
autoreap = do_notify_parent(tsk, sig);
} else if (thread_group_leader(tsk)) {
autoreap = thread_group_empty(tsk) &&
- do_notify_parent(tsk, tsk->exit_signal);
+ do_notify_parent(tsk, tsk->exit_signal);
} else {
autoreap = true;
/* untraced sub-thread */
@@ -896,11 +895,16 @@ static void synchronize_group_exit(struct task_struct *tsk, long code)
void __noreturn do_exit(long code)
{
struct task_struct *tsk = current;
+ struct kthread *kthread;
int group_dead;
WARN_ON(irqs_disabled());
WARN_ON(tsk->plug);
+ kthread = tsk_is_kthread(tsk);
+ if (unlikely(kthread))
+ kthread_do_exit(kthread, code);
+
kcov_task_exit(tsk);
kmsan_task_exit(tsk);
@@ -1013,6 +1017,7 @@ void __noreturn do_exit(long code)
lockdep_free_task(tsk);
do_task_dead();
}
+EXPORT_SYMBOL(do_exit);
void __noreturn make_task_dead(int signr)
{
diff --git a/kernel/fork.c b/kernel/fork.c
index e832da9d15a4..f1ad69c6dc2d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -46,6 +46,7 @@
#include <linux/mm_inline.h>
#include <linux/memblock.h>
#include <linux/nsproxy.h>
+#include <linux/ns/ns_common_types.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cgroup.h>
@@ -95,6 +96,7 @@
#include <linux/thread_info.h>
#include <linux/kstack_erase.h>
#include <linux/kasan.h>
+#include <linux/randomize_kstack.h>
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>
@@ -345,7 +347,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
stack = kasan_reset_tag(vm_area->addr);
/* Clear stale pointers from reused stack. */
- memset(stack, 0, THREAD_SIZE);
+ clear_pages(vm_area->addr, vm_area->nr_pages);
tsk->stack_vm_area = vm_area;
tsk->stack = stack;
@@ -1000,6 +1002,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_SCHED_MM_CID
tsk->mm_cid.cid = MM_CID_UNSET;
tsk->mm_cid.active = 0;
+ INIT_HLIST_NODE(&tsk->mm_cid.node);
#endif
return tsk;
@@ -1013,13 +1016,14 @@ free_tsk:
__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
-static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
+static unsigned long coredump_filter = MMF_DUMP_FILTER_DEFAULT;
static int __init coredump_filter_setup(char *s)
{
- default_dump_filter =
- (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
- MMF_DUMP_FILTER_MASK;
+ if (kstrtoul(s, 0, &coredump_filter))
+ return 0;
+ coredump_filter <<= MMF_DUMP_FILTER_SHIFT;
+ coredump_filter &= MMF_DUMP_FILTER_MASK;
return 1;
}
@@ -1105,7 +1109,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
__mm_flags_overwrite_word(mm, mmf_init_legacy_flags(flags));
mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
} else {
- __mm_flags_overwrite_word(mm, default_dump_filter);
+ __mm_flags_overwrite_word(mm, coredump_filter);
mm->def_flags = 0;
}
@@ -1586,7 +1590,6 @@ static int copy_mm(u64 clone_flags, struct task_struct *tsk)
tsk->mm = mm;
tsk->active_mm = mm;
- sched_mm_cid_fork(tsk);
return 0;
}
@@ -2028,6 +2031,41 @@ __latent_entropy struct task_struct *copy_process(
return ERR_PTR(-EINVAL);
}
+ if (clone_flags & CLONE_AUTOREAP) {
+ if (clone_flags & CLONE_THREAD)
+ return ERR_PTR(-EINVAL);
+ if (clone_flags & CLONE_PARENT)
+ return ERR_PTR(-EINVAL);
+ if (args->exit_signal)
+ return ERR_PTR(-EINVAL);
+ }
+
+ if ((clone_flags & CLONE_PARENT) && current->signal->autoreap)
+ return ERR_PTR(-EINVAL);
+
+ if (clone_flags & CLONE_NNP) {
+ if (clone_flags & CLONE_THREAD)
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (clone_flags & CLONE_PIDFD_AUTOKILL) {
+ if (!(clone_flags & CLONE_PIDFD))
+ return ERR_PTR(-EINVAL);
+ if (!(clone_flags & CLONE_AUTOREAP))
+ return ERR_PTR(-EINVAL);
+ if (clone_flags & CLONE_THREAD)
+ return ERR_PTR(-EINVAL);
+ /*
+ * Without CLONE_NNP the child could escalate privileges
+ * after being spawned, so require CAP_SYS_ADMIN.
+ * With CLONE_NNP the child can't gain new privileges,
+ * so allow unprivileged usage.
+ */
+ if (!(clone_flags & CLONE_NNP) &&
+ !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+ }
+
/*
* Force any signals received before this point to be delivered
* before the fork happens. Collect up signals sent to multiple
@@ -2076,6 +2114,7 @@ __latent_entropy struct task_struct *copy_process(
ftrace_graph_init_task(p);
rt_mutex_init_task(p);
+ raw_spin_lock_init(&p->blocked_lock);
lockdep_assert_irqs_enabled();
#ifdef CONFIG_PROVE_LOCKING
@@ -2250,13 +2289,18 @@ __latent_entropy struct task_struct *copy_process(
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
- int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
+ unsigned flags = PIDFD_STALE;
+
+ if (clone_flags & CLONE_THREAD)
+ flags |= PIDFD_THREAD;
+ if (clone_flags & CLONE_PIDFD_AUTOKILL)
+ flags |= PIDFD_AUTOKILL;
/*
* Note that no task has been attached to @pid yet indicate
* that via CLONE_PIDFD.
*/
- retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
+ retval = pidfd_prepare(pid, flags, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
pidfd = retval;
@@ -2392,7 +2436,11 @@ __latent_entropy struct task_struct *copy_process(
rseq_fork(p, clone_flags);
- /* Don't start children in a dying pid namespace */
+ /*
+ * If zap_pid_ns_processes() was called after alloc_pid(), the new
+ * child missed SIGKILL. If current is not in the same namespace,
+ * we can't rely on fatal_signal_pending() below.
+ */
if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
retval = -ENOMEM;
goto bad_fork_core_free;
@@ -2412,6 +2460,9 @@ __latent_entropy struct task_struct *copy_process(
*/
copy_seccomp(p);
+ if (clone_flags & CLONE_NNP)
+ task_set_no_new_privs(p);
+
init_task_pid_links(p);
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
@@ -2423,7 +2474,10 @@ __latent_entropy struct task_struct *copy_process(
init_task_pid(p, PIDTYPE_SID, task_session(current));
if (is_child_reaper(pid)) {
- ns_of_pid(pid)->child_reaper = p;
+ struct pid_namespace *ns = ns_of_pid(pid);
+
+ ASSERT_EXCLUSIVE_WRITER(ns->child_reaper);
+ WRITE_ONCE(ns->child_reaper, p);
p->signal->flags |= SIGNAL_UNKILLABLE;
}
p->signal->shared_pending.signal = delayed.signal;
@@ -2435,6 +2489,8 @@ __latent_entropy struct task_struct *copy_process(
*/
p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
p->real_parent->signal->is_child_subreaper;
+ if (clone_flags & CLONE_AUTOREAP)
+ p->signal->autoreap = 1;
list_add_tail(&p->sibling, &p->real_parent->children);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
attach_pid(p, PIDTYPE_TGID);
@@ -2463,8 +2519,12 @@ __latent_entropy struct task_struct *copy_process(
fd_install(pidfd, pidfile);
proc_fork_connector(p);
- sched_post_fork(p);
+ /*
+ * sched_ext needs @p to be associated with its cgroup in its post_fork
+ * hook. cgroup_post_fork() should come before sched_post_fork().
+ */
cgroup_post_fork(p, args);
+ sched_post_fork(p);
perf_event_fork(p);
trace_task_newtask(p, clone_flags);
@@ -2498,7 +2558,6 @@ bad_fork_cleanup_namespaces:
exit_nsproxy_namespaces(p);
bad_fork_cleanup_mm:
if (p->mm) {
- sched_mm_cid_exit(p);
mm_clear_owner(p->mm, p);
mmput(p->mm);
}
@@ -2620,6 +2679,16 @@ pid_t kernel_clone(struct kernel_clone_args *args)
pid_t nr;
/*
+ * Creating an empty mount namespace implies creating a new mount
+ * namespace. Set this before copy_process() so that the
+ * CLONE_NEWNS|CLONE_FS mutual exclusion check works correctly.
+ */
+ if (clone_flags & CLONE_EMPTY_MNTNS) {
+ clone_flags |= CLONE_NEWNS;
+ args->flags = clone_flags;
+ }
+
+ /*
* For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
* to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
* mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
@@ -2897,7 +2966,9 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
/* Verify that no unknown flags are passed along. */
if (kargs->flags &
- ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
+ ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND |
+ CLONE_INTO_CGROUP | CLONE_AUTOREAP | CLONE_NNP |
+ CLONE_PIDFD_AUTOKILL | CLONE_EMPTY_MNTNS))
return false;
/*
@@ -3046,11 +3117,9 @@ void __init proc_caches_init(void)
*/
static int check_unshare_flags(unsigned long unshare_flags)
{
- if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
+ if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
- CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
- CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
- CLONE_NEWTIME))
+ CLONE_NS_ALL | UNSHARE_EMPTY_MNTNS))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing
@@ -3085,7 +3154,7 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
return 0;
/* don't need lock here; in the worst case we'll do useless copy */
- if (fs->users == 1)
+ if (!(unshare_flags & CLONE_NEWNS) && fs->users == 1)
return 0;
*new_fsp = copy_fs_struct(fs);
@@ -3149,6 +3218,8 @@ int ksys_unshare(unsigned long unshare_flags)
/*
* If unsharing namespace, must also unshare filesystem information.
*/
+ if (unshare_flags & UNSHARE_EMPTY_MNTNS)
+ unshare_flags |= CLONE_NEWNS;
if (unshare_flags & CLONE_NEWNS)
unshare_flags |= CLONE_FS;
@@ -3175,11 +3246,10 @@ int ksys_unshare(unsigned long unshare_flags)
new_cred, new_fs);
if (err)
goto bad_unshare_cleanup_cred;
-
if (new_cred) {
err = set_cred_ucounts(new_cred);
if (err)
- goto bad_unshare_cleanup_cred;
+ goto bad_unshare_cleanup_nsproxy;
}
if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
@@ -3195,8 +3265,10 @@ int ksys_unshare(unsigned long unshare_flags)
shm_init_task(current);
}
- if (new_nsproxy)
+ if (new_nsproxy) {
switch_task_namespaces(current, new_nsproxy);
+ new_nsproxy = NULL;
+ }
task_lock(current);
@@ -3225,13 +3297,15 @@ int ksys_unshare(unsigned long unshare_flags)
perf_event_namespaces(current);
+bad_unshare_cleanup_nsproxy:
+ if (new_nsproxy)
+ put_nsproxy(new_nsproxy);
bad_unshare_cleanup_cred:
if (new_cred)
put_cred(new_cred);
bad_unshare_cleanup_fd:
if (new_fd)
put_files_struct(new_fd);
-
bad_unshare_cleanup_fs:
if (new_fs)
free_fs_struct(new_fs);
diff --git a/kernel/futex/Makefile b/kernel/futex/Makefile
index b77188d1fa07..dce70f8a322b 100644
--- a/kernel/futex/Makefile
+++ b/kernel/futex/Makefile
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
+CONTEXT_ANALYSIS := y
+
obj-y += core.o syscalls.o pi.o requeue.o waitwake.o
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index cf7e610eac42..ff2a4fb2993f 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -342,7 +342,7 @@ static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr)
if (!vma)
return FUTEX_NO_NODE;
- mpol = vma_policy(vma);
+ mpol = READ_ONCE(vma->vm_policy);
if (!mpol)
return FUTEX_NO_NODE;
@@ -864,7 +864,6 @@ void __futex_unqueue(struct futex_q *q)
/* The key must be already stored in q->key. */
void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb)
- __acquires(&hb->lock)
{
/*
* Increment the counter before taking the lock so that
@@ -879,10 +878,10 @@ void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb)
q->lock_ptr = &hb->lock;
spin_lock(&hb->lock);
+ __acquire(q->lock_ptr);
}
void futex_q_unlock(struct futex_hash_bucket *hb)
- __releases(&hb->lock)
{
futex_hb_waiters_dec(hb);
spin_unlock(&hb->lock);
@@ -1443,12 +1442,15 @@ static void futex_cleanup(struct task_struct *tsk)
void futex_exit_recursive(struct task_struct *tsk)
{
/* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
- if (tsk->futex_state == FUTEX_STATE_EXITING)
+ if (tsk->futex_state == FUTEX_STATE_EXITING) {
+ __assume_ctx_lock(&tsk->futex_exit_mutex);
mutex_unlock(&tsk->futex_exit_mutex);
+ }
tsk->futex_state = FUTEX_STATE_DEAD;
}
static void futex_cleanup_begin(struct task_struct *tsk)
+ __acquires(&tsk->futex_exit_mutex)
{
/*
* Prevent various race issues against a concurrent incoming waiter
@@ -1475,6 +1477,7 @@ static void futex_cleanup_begin(struct task_struct *tsk)
}
static void futex_cleanup_end(struct task_struct *tsk, int state)
+ __releases(&tsk->futex_exit_mutex)
{
/*
* Lockless store. The only side effect is that an observer might
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 30c2afa03889..9f6bf6f585fc 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -217,7 +217,7 @@ enum futex_access {
extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
enum futex_access rw);
-extern void futex_q_lockptr_lock(struct futex_q *q);
+extern void futex_q_lockptr_lock(struct futex_q *q) __acquires(q->lock_ptr);
extern struct hrtimer_sleeper *
futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
int flags, u64 range_ns);
@@ -311,9 +311,11 @@ extern int futex_unqueue(struct futex_q *q);
static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
struct task_struct *task)
__releases(&hb->lock)
+ __releases(q->lock_ptr)
{
__futex_queue(q, hb, task);
spin_unlock(&hb->lock);
+ __release(q->lock_ptr);
}
extern void futex_unqueue_pi(struct futex_q *q);
@@ -358,9 +360,12 @@ static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb)
#endif
}
-extern void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb);
-extern void futex_q_unlock(struct futex_hash_bucket *hb);
+extern void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb)
+ __acquires(&hb->lock)
+ __acquires(q->lock_ptr);
+extern void futex_q_unlock(struct futex_hash_bucket *hb)
+ __releases(&hb->lock);
extern int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
union futex_key *key,
@@ -379,6 +384,9 @@ extern int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked);
*/
static inline void
double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+ __acquires(&hb1->lock)
+ __acquires(&hb2->lock)
+ __no_context_analysis
{
if (hb1 > hb2)
swap(hb1, hb2);
@@ -390,6 +398,9 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
static inline void
double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+ __releases(&hb1->lock)
+ __releases(&hb2->lock)
+ __no_context_analysis
{
spin_unlock(&hb1->lock);
if (hb1 != hb2)
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index bc1f7e83a37e..643199fdbe62 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -389,6 +389,7 @@ static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
* Initialize the pi_mutex in locked state and make @p
* the owner of it:
*/
+ __assume_ctx_lock(&pi_state->pi_mutex.wait_lock);
rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
/* Store the key for possible exit cleanups: */
@@ -614,6 +615,8 @@ int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
static int wake_futex_pi(u32 __user *uaddr, u32 uval,
struct futex_pi_state *pi_state,
struct rt_mutex_waiter *top_waiter)
+ __must_hold(&pi_state->pi_mutex.wait_lock)
+ __releases(&pi_state->pi_mutex.wait_lock)
{
struct task_struct *new_owner;
bool postunlock = false;
@@ -670,6 +673,8 @@ out_unlock:
static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
struct task_struct *argowner)
+ __must_hold(&q->pi_state->pi_mutex.wait_lock)
+ __must_hold(q->lock_ptr)
{
struct futex_pi_state *pi_state = q->pi_state;
struct task_struct *oldowner, *newowner;
@@ -918,7 +923,7 @@ int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to;
- struct task_struct *exiting = NULL;
+ struct task_struct *exiting;
struct rt_mutex_waiter rt_waiter;
struct futex_q q = futex_q_init;
DEFINE_WAKE_Q(wake_q);
@@ -933,6 +938,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
to = futex_setup_timer(time, &timeout, flags, 0);
retry:
+ exiting = NULL;
ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
if (unlikely(ret != 0))
goto out;
@@ -966,6 +972,7 @@ retry_private:
* - EAGAIN: The user space value changed.
*/
futex_q_unlock(hb);
+ __release(q.lock_ptr);
/*
* Handle the case where the owner is in the middle of
* exiting. Wait for the exit to complete otherwise
@@ -1090,6 +1097,7 @@ no_block:
if (res)
ret = (res < 0) ? res : 0;
+ __release(&hb->lock);
futex_unqueue_pi(&q);
spin_unlock(q.lock_ptr);
if (q.drop_hb_ref) {
@@ -1101,10 +1109,12 @@ no_block:
out_unlock_put_key:
futex_q_unlock(hb);
+ __release(q.lock_ptr);
goto out;
uaddr_faulted:
futex_q_unlock(hb);
+ __release(q.lock_ptr);
ret = fault_in_user_writeable(uaddr);
if (ret)
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 743c7a728237..77ad9691f6a6 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -459,6 +459,14 @@ SYSCALL_DEFINE4(futex_requeue,
if (ret)
return ret;
+ /*
+ * For now mandate both flags are identical, like the sys_futex()
+ * interface has. If/when we merge the variable sized futex support,
+ * that patch can modify this test to allow a difference in size.
+ */
+ if (futexes[0].w.flags != futexes[1].w.flags)
+ return -EINVAL;
+
cmpval = futexes[0].w.val;
return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags,
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index 1c2dd03f11ec..ceed9d879059 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -462,6 +462,7 @@ retry:
}
futex_q_unlock(hb);
+ __release(q->lock_ptr);
}
__set_current_state(TASK_RUNNING);
@@ -628,6 +629,7 @@ retry_private:
if (ret) {
futex_q_unlock(hb);
+ __release(q->lock_ptr);
ret = get_user(uval, uaddr);
if (ret)
@@ -641,11 +643,13 @@ retry_private:
if (uval != val) {
futex_q_unlock(hb);
+ __release(q->lock_ptr);
return -EWOULDBLOCK;
}
if (key2 && futex_match(&q->key, key2)) {
futex_q_unlock(hb);
+ __release(q->lock_ptr);
return -EINVAL;
}
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index d2254c91450b..6fcc94ce4ca9 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -36,7 +36,7 @@ static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
/*
* Total number of tasks detected as hung since boot:
*/
-static unsigned long __read_mostly sysctl_hung_task_detect_count;
+static atomic_long_t sysctl_hung_task_detect_count = ATOMIC_LONG_INIT(0);
/*
* Limit number of tasks checked in a batch.
@@ -223,37 +223,36 @@ static inline void debug_show_blocker(struct task_struct *task, unsigned long ti
}
#endif
-static void check_hung_task(struct task_struct *t, unsigned long timeout,
- unsigned long prev_detect_count)
+/**
+ * hung_task_info - Print diagnostic details for a hung task
+ * @t: Pointer to the detected hung task.
+ * @timeout: Timeout threshold for detecting hung tasks
+ * @this_round_count: Count of hung tasks detected in the current iteration
+ *
+ * Print structured information about the specified hung task, if warnings
+ * are enabled or if the panic batch threshold is exceeded.
+ */
+static void hung_task_info(struct task_struct *t, unsigned long timeout,
+ unsigned long this_round_count)
{
- unsigned long total_hung_task;
-
- if (!task_is_hung(t, timeout))
- return;
-
- /*
- * This counter tracks the total number of tasks detected as hung
- * since boot.
- */
- sysctl_hung_task_detect_count++;
-
- total_hung_task = sysctl_hung_task_detect_count - prev_detect_count;
trace_sched_process_hang(t);
- if (sysctl_hung_task_panic && total_hung_task >= sysctl_hung_task_panic) {
+ if (sysctl_hung_task_panic && this_round_count >= sysctl_hung_task_panic) {
console_verbose();
hung_task_call_panic = true;
}
/*
- * Ok, the task did not get scheduled for more than 2 minutes,
- * complain:
+ * The given task did not get scheduled for more than
+ * CONFIG_DEFAULT_HUNG_TASK_TIMEOUT. Therefore, complain
+ * accordingly
*/
if (sysctl_hung_task_warnings || hung_task_call_panic) {
if (sysctl_hung_task_warnings > 0)
sysctl_hung_task_warnings--;
- pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
- t->comm, t->pid, (jiffies - t->last_switch_time) / HZ);
+ pr_err("INFO: task %s:%d blocked%s for more than %ld seconds.\n",
+ t->comm, t->pid, t->in_iowait ? " in I/O wait" : "",
+ (jiffies - t->last_switch_time) / HZ);
pr_err(" %s %s %.*s\n",
print_tainted(), init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
@@ -297,15 +296,14 @@ static bool rcu_lock_break(struct task_struct *g, struct task_struct *t)
/*
* Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
- * a really long time (120 seconds). If that happens, print out
- * a warning.
+ * a really long time. If that happens, print out a warning.
*/
static void check_hung_uninterruptible_tasks(unsigned long timeout)
{
int max_count = sysctl_hung_task_check_count;
unsigned long last_break = jiffies;
struct task_struct *g, *t;
- unsigned long prev_detect_count = sysctl_hung_task_detect_count;
+ unsigned long this_round_count;
int need_warning = sysctl_hung_task_warnings;
unsigned long si_mask = hung_task_si_mask;
@@ -316,10 +314,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
if (test_taint(TAINT_DIE) || did_panic)
return;
-
+ this_round_count = 0;
rcu_read_lock();
for_each_process_thread(g, t) {
-
if (!max_count--)
goto unlock;
if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) {
@@ -328,12 +325,22 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
last_break = jiffies;
}
- check_hung_task(t, timeout, prev_detect_count);
+ if (task_is_hung(t, timeout)) {
+ /*
+ * Increment the global counter so that userspace could
+ * start migrating tasks ASAP. But count the current
+ * round separately because userspace could reset
+ * the global counter at any time.
+ */
+ atomic_long_inc(&sysctl_hung_task_detect_count);
+ this_round_count++;
+ hung_task_info(t, timeout, this_round_count);
+ }
}
unlock:
rcu_read_unlock();
- if (!(sysctl_hung_task_detect_count - prev_detect_count))
+ if (!this_round_count)
return;
if (need_warning || hung_task_call_panic) {
@@ -358,6 +365,46 @@ static long hung_timeout_jiffies(unsigned long last_checked,
}
#ifdef CONFIG_SYSCTL
+
+/**
+ * proc_dohung_task_detect_count - proc handler for hung_task_detect_count
+ * @table: Pointer to the struct ctl_table definition for this proc entry
+ * @dir: Flag indicating the operation
+ * @buffer: User space buffer for data transfer
+ * @lenp: Pointer to the length of the data being transferred
+ * @ppos: Pointer to the current file offset
+ *
+ * This handler is used for reading the current hung task detection count
+ * and for resetting it to zero when a write operation is performed using a
+ * zero value only.
+ * Return: 0 on success, or a negative error code on failure.
+ */
+static int proc_dohung_task_detect_count(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ unsigned long detect_count;
+ struct ctl_table proxy_table;
+ int err;
+
+ proxy_table = *table;
+ proxy_table.data = &detect_count;
+
+ if (SYSCTL_KERN_TO_USER(dir))
+ detect_count = atomic_long_read(&sysctl_hung_task_detect_count);
+
+ err = proc_doulongvec_minmax(&proxy_table, dir, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+
+ if (SYSCTL_USER_TO_KERN(dir)) {
+ if (detect_count)
+ return -EINVAL;
+ atomic_long_set(&sysctl_hung_task_detect_count, 0);
+ }
+
+ return 0;
+}
+
/*
* Process updating of timeout sysctl
*/
@@ -438,10 +485,9 @@ static const struct ctl_table hung_task_sysctls[] = {
},
{
.procname = "hung_task_detect_count",
- .data = &sysctl_hung_task_detect_count,
.maxlen = sizeof(unsigned long),
- .mode = 0444,
- .proc_handler = proc_doulongvec_minmax,
+ .mode = 0644,
+ .proc_handler = proc_dohung_task_detect_count,
},
{
.procname = "hung_task_sys_info",
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 85c45cfe7223..78f2418a8925 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -115,13 +115,10 @@ unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
if (resv > minvec)
return 0;
- if (affd->calc_sets) {
+ if (affd->calc_sets)
set_vecs = maxvec - resv;
- } else {
- cpus_read_lock();
+ else
set_vecs = cpumask_weight(cpu_possible_mask);
- cpus_read_unlock();
- }
return resv + min(set_vecs, maxvec - resv);
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6147a07d0127..6c9b1dc4e7d4 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -14,6 +14,7 @@
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/irqdomain.h>
+#include <linux/random.h>
#include <trace/events/irq.h>
@@ -929,6 +930,8 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
enabled ? " and unmasked" : "", irq, cpu);
}
+ add_interrupt_randomness(irq);
+
if (chip->irq_eoi)
chip->irq_eoi(&desc->irq_data);
}
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 0f79a4abea05..faafb43a4e61 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -39,7 +39,7 @@ struct irq_matrix {
/**
* irq_alloc_matrix - Allocate an irq_matrix structure and initialize it
- * @matrix_bits: Number of matrix bits must be <= IRQ_MATRIX_BITS
+ * @matrix_bits: Number of matrix bits
* @alloc_start: From which bit the allocation search starts
* @alloc_end: At which bit the allocation search ends, i.e first
* invalid bit
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 73f7e1fd4ab4..120fd7365fbe 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -79,7 +79,7 @@ void __weak arch_irq_work_raise(void)
static __always_inline void irq_work_raise(struct irq_work *work)
{
if (trace_ipi_send_cpu_enabled() && arch_irq_work_has_interrupt())
- trace_ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func);
+ trace_call__ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func);
arch_irq_work_raise();
}
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 7cb19e601426..e851e4b37d0e 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -529,15 +529,6 @@ void __init jump_label_init(void)
struct static_key *key = NULL;
struct jump_entry *iter;
- /*
- * Since we are initializing the static_key.enabled field with
- * with the 'raw' int values (to avoid pulling in atomic.h) in
- * jump_label.h, let's make sure that is safe. There are only two
- * cases to check since we initialize to 0 or 1.
- */
- BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0);
- BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1);
-
if (static_key_initialized)
return;
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index 79e655ea4ca1..ae758150ccb9 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -168,7 +168,7 @@ static bool __report_matches(const struct expect_report *r)
if (!report_available())
return false;
- expect = kmalloc_obj(observed.lines);
+ expect = (typeof(expect))kmalloc_obj(observed.lines);
if (WARN_ON(!expect))
return false;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 2fea396d29b9..a43d2da0fe3e 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -47,7 +47,6 @@
#include <asm/page.h>
#include <asm/sections.h>
-#include <crypto/hash.h>
#include "kexec_internal.h"
atomic_t __kexec_lock = ATOMIC_INIT(0);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ab25b4aa9095..bfc89083daa9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1144,12 +1144,12 @@ static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
lockdep_assert_held(&kprobe_mutex);
ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0);
- if (WARN_ONCE(ret < 0, "Failed to arm kprobe-ftrace at %pS (error %d)\n", p->addr, ret))
+ if (ret < 0)
return ret;
if (*cnt == 0) {
ret = register_ftrace_function(ops);
- if (WARN(ret < 0, "Failed to register kprobe-ftrace (error %d)\n", ret)) {
+ if (ret < 0) {
/*
* At this point, since ops is not registered, we should be safe from
* registering empty filter.
@@ -1178,6 +1178,10 @@ static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
int ret;
lockdep_assert_held(&kprobe_mutex);
+ if (unlikely(kprobe_ftrace_disabled)) {
+ /* Now ftrace is disabled forever, disarm is already done. */
+ return 0;
+ }
if (*cnt == 1) {
ret = unregister_ftrace_function(ops);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index a9e6354d9e25..f45ade718054 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -8,6 +8,7 @@
#include <asm/byteorder.h>
#include <linux/kobject.h>
+#include <linux/ksysfs.h>
#include <linux/string.h>
#include <linux/sysfs.h>
#include <linux/export.h>
@@ -213,7 +214,7 @@ static const struct attribute_group kernel_attr_group = {
.attrs = kernel_attrs,
};
-static int __init ksysfs_init(void)
+void __init ksysfs_init(void)
{
int error;
@@ -234,14 +235,12 @@ static int __init ksysfs_init(void)
goto group_exit;
}
- return 0;
+ return;
group_exit:
sysfs_remove_group(kernel_kobj, &kernel_attr_group);
kset_exit:
kobject_put(kernel_kobj);
exit:
- return error;
+ pr_err("failed to initialize the kernel kobject: %d\n", error);
}
-
-core_initcall(ksysfs_init);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 20451b624b67..791210daf8b4 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -85,24 +85,6 @@ static inline struct kthread *to_kthread(struct task_struct *k)
return k->worker_private;
}
-/*
- * Variant of to_kthread() that doesn't assume @p is a kthread.
- *
- * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will
- * always remain a kthread. For kthreads p->worker_private always
- * points to a struct kthread. For tasks that are not kthreads
- * p->worker_private is used to point to other things.
- *
- * Return NULL for any task that is not a kthread.
- */
-static inline struct kthread *__to_kthread(struct task_struct *p)
-{
- void *kthread = p->worker_private;
- if (kthread && !(p->flags & PF_KTHREAD))
- kthread = NULL;
- return kthread;
-}
-
void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk)
{
struct kthread *kthread = to_kthread(tsk);
@@ -193,7 +175,7 @@ EXPORT_SYMBOL_GPL(kthread_should_park);
bool kthread_should_stop_or_park(void)
{
- struct kthread *kthread = __to_kthread(current);
+ struct kthread *kthread = tsk_is_kthread(current);
if (!kthread)
return false;
@@ -234,7 +216,7 @@ EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
*/
void *kthread_func(struct task_struct *task)
{
- struct kthread *kthread = __to_kthread(task);
+ struct kthread *kthread = tsk_is_kthread(task);
if (kthread)
return kthread->threadfn;
return NULL;
@@ -266,7 +248,7 @@ EXPORT_SYMBOL_GPL(kthread_data);
*/
void *kthread_probe_data(struct task_struct *task)
{
- struct kthread *kthread = __to_kthread(task);
+ struct kthread *kthread = tsk_is_kthread(task);
void *data = NULL;
if (kthread)
@@ -309,19 +291,8 @@ void kthread_parkme(void)
}
EXPORT_SYMBOL_GPL(kthread_parkme);
-/**
- * kthread_exit - Cause the current kthread return @result to kthread_stop().
- * @result: The integer value to return to kthread_stop().
- *
- * While kthread_exit can be called directly, it exists so that
- * functions which do some additional work in non-modular code such as
- * module_put_and_kthread_exit can be implemented.
- *
- * Does not return.
- */
-void __noreturn kthread_exit(long result)
+void kthread_do_exit(struct kthread *kthread, long result)
{
- struct kthread *kthread = to_kthread(current);
kthread->result = result;
if (!list_empty(&kthread->affinity_node)) {
mutex_lock(&kthread_affinity_lock);
@@ -333,9 +304,7 @@ void __noreturn kthread_exit(long result)
kthread->preferred_affinity = NULL;
}
}
- do_exit(0);
}
-EXPORT_SYMBOL(kthread_exit);
/**
* kthread_complete_and_exit - Exit the current kthread.
@@ -683,7 +652,7 @@ void kthread_set_per_cpu(struct task_struct *k, int cpu)
bool kthread_is_per_cpu(struct task_struct *p)
{
- struct kthread *kthread = __to_kthread(p);
+ struct kthread *kthread = tsk_is_kthread(p);
if (!kthread)
return false;
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index cc68a3692905..532f455c5d4f 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -5,6 +5,7 @@
* Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
* Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
* Copyright (C) 2025 Pasha Tatashin <pasha.tatashin@soleen.com>
+ * Copyright (C) 2026 Google LLC, Jason Miu <jasonmiu@google.com>
*/
#define pr_fmt(fmt) "KHO: " fmt
@@ -13,8 +14,10 @@
#include <linux/cma.h>
#include <linux/kmemleak.h>
#include <linux/count_zeros.h>
+#include <linux/kasan.h>
#include <linux/kexec.h>
#include <linux/kexec_handover.h>
+#include <linux/kho_radix_tree.h>
#include <linux/kho/abi/kexec_handover.h>
#include <linux/libfdt.h>
#include <linux/list.h>
@@ -64,163 +67,316 @@ static int __init kho_parse_enable(char *p)
}
early_param("kho", kho_parse_enable);
-/*
- * Keep track of memory that is to be preserved across KHO.
- *
- * The serializing side uses two levels of xarrays to manage chunks of per-order
- * PAGE_SIZE byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order
- * of a 8TB system would fit inside a single 4096 byte bitmap. For order 0
- * allocations each bitmap will cover 128M of address space. Thus, for 16G of
- * memory at most 512K of bitmap memory will be needed for order 0.
- *
- * This approach is fully incremental, as the serialization progresses folios
- * can continue be aggregated to the tracker. The final step, immediately prior
- * to kexec would serialize the xarray information into a linked list for the
- * successor kernel to parse.
- */
-
-#define PRESERVE_BITS (PAGE_SIZE * 8)
-
-struct kho_mem_phys_bits {
- DECLARE_BITMAP(preserve, PRESERVE_BITS);
-};
-
-static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE);
-
-struct kho_mem_phys {
- /*
- * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized
- * to order.
- */
- struct xarray phys_bits;
-};
-
-struct kho_mem_track {
- /* Points to kho_mem_phys, each order gets its own bitmap tree */
- struct xarray orders;
-};
-
-struct khoser_mem_chunk;
-
struct kho_out {
void *fdt;
- bool finalized;
- struct mutex lock; /* protects KHO FDT finalization */
+ struct mutex lock; /* protects KHO FDT */
- struct kho_mem_track track;
+ struct kho_radix_tree radix_tree;
struct kho_debugfs dbg;
};
static struct kho_out kho_out = {
.lock = __MUTEX_INITIALIZER(kho_out.lock),
- .track = {
- .orders = XARRAY_INIT(kho_out.track.orders, 0),
+ .radix_tree = {
+ .lock = __MUTEX_INITIALIZER(kho_out.radix_tree.lock),
},
- .finalized = false,
};
-static void *xa_load_or_alloc(struct xarray *xa, unsigned long index)
+/**
+ * kho_radix_encode_key - Encodes a physical address and order into a radix key.
+ * @phys: The physical address of the page.
+ * @order: The order of the page.
+ *
+ * This function combines a page's physical address and its order into a
+ * single unsigned long, which is used as a key for all radix tree
+ * operations.
+ *
+ * Return: The encoded unsigned long radix key.
+ */
+static unsigned long kho_radix_encode_key(phys_addr_t phys, unsigned int order)
{
- void *res = xa_load(xa, index);
+ /* Order bits part */
+ unsigned long h = 1UL << (KHO_ORDER_0_LOG2 - order);
+ /* Shifted physical address part */
+ unsigned long l = phys >> (PAGE_SHIFT + order);
- if (res)
- return res;
+ return h | l;
+}
- void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL);
+/**
+ * kho_radix_decode_key - Decodes a radix key back into a physical address and order.
+ * @key: The unsigned long key to decode.
+ * @order: An output parameter, a pointer to an unsigned int where the decoded
+ * page order will be stored.
+ *
+ * This function reverses the encoding performed by kho_radix_encode_key(),
+ * extracting the original physical address and page order from a given key.
+ *
+ * Return: The decoded physical address.
+ */
+static phys_addr_t kho_radix_decode_key(unsigned long key, unsigned int *order)
+{
+ unsigned int order_bit = fls64(key);
+ phys_addr_t phys;
- if (!elm)
- return ERR_PTR(-ENOMEM);
+ /* order_bit is numbered starting at 1 from fls64 */
+ *order = KHO_ORDER_0_LOG2 - order_bit + 1;
+ /* The order is discarded by the shift */
+ phys = key << (PAGE_SHIFT + *order);
- if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE)))
- return ERR_PTR(-EINVAL);
+ return phys;
+}
- res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
- if (xa_is_err(res))
- return ERR_PTR(xa_err(res));
- else if (res)
- return res;
+static unsigned long kho_radix_get_bitmap_index(unsigned long key)
+{
+ return key % (1 << KHO_BITMAP_SIZE_LOG2);
+}
+
+static unsigned long kho_radix_get_table_index(unsigned long key,
+ unsigned int level)
+{
+ int s;
- return no_free_ptr(elm);
+ s = ((level - 1) * KHO_TABLE_SIZE_LOG2) + KHO_BITMAP_SIZE_LOG2;
+ return (key >> s) % (1 << KHO_TABLE_SIZE_LOG2);
}
-static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn,
- unsigned int order)
+/**
+ * kho_radix_add_page - Marks a page as preserved in the radix tree.
+ * @tree: The KHO radix tree.
+ * @pfn: The page frame number of the page to preserve.
+ * @order: The order of the page.
+ *
+ * This function traverses the radix tree based on the key derived from @pfn
+ * and @order. It sets the corresponding bit in the leaf bitmap to mark the
+ * page for preservation. If intermediate nodes do not exist along the path,
+ * they are allocated and added to the tree.
+ *
+ * Return: 0 on success, or a negative error code on failure.
+ */
+int kho_radix_add_page(struct kho_radix_tree *tree,
+ unsigned long pfn, unsigned int order)
{
- struct kho_mem_phys_bits *bits;
- struct kho_mem_phys *physxa;
- const unsigned long pfn_high = pfn >> order;
+ /* Newly allocated nodes for error cleanup */
+ struct kho_radix_node *intermediate_nodes[KHO_TREE_MAX_DEPTH] = { 0 };
+ unsigned long key = kho_radix_encode_key(PFN_PHYS(pfn), order);
+ struct kho_radix_node *anchor_node = NULL;
+ struct kho_radix_node *node = tree->root;
+ struct kho_radix_node *new_node;
+ unsigned int i, idx, anchor_idx;
+ struct kho_radix_leaf *leaf;
+ int err = 0;
- physxa = xa_load(&track->orders, order);
- if (WARN_ON_ONCE(!physxa))
- return;
+ if (WARN_ON_ONCE(!tree->root))
+ return -EINVAL;
- bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
- if (WARN_ON_ONCE(!bits))
- return;
+ might_sleep();
+
+ guard(mutex)(&tree->lock);
+
+ /* Go from high levels to low levels */
+ for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) {
+ idx = kho_radix_get_table_index(key, i);
+
+ if (node->table[idx]) {
+ node = phys_to_virt(node->table[idx]);
+ continue;
+ }
+
+ /* Next node is empty, create a new node for it */
+ new_node = (struct kho_radix_node *)get_zeroed_page(GFP_KERNEL);
+ if (!new_node) {
+ err = -ENOMEM;
+ goto err_free_nodes;
+ }
+
+ node->table[idx] = virt_to_phys(new_node);
+
+ /*
+ * Capture the node where the new branch starts for cleanup
+ * if allocation fails.
+ */
+ if (!anchor_node) {
+ anchor_node = node;
+ anchor_idx = idx;
+ }
+ intermediate_nodes[i] = new_node;
+
+ node = new_node;
+ }
+
+ /* Handle the leaf level bitmap (level 0) */
+ idx = kho_radix_get_bitmap_index(key);
+ leaf = (struct kho_radix_leaf *)node;
+ __set_bit(idx, leaf->bitmap);
- clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
+ return 0;
+
+err_free_nodes:
+ for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) {
+ if (intermediate_nodes[i])
+ free_page((unsigned long)intermediate_nodes[i]);
+ }
+ if (anchor_node)
+ anchor_node->table[anchor_idx] = 0;
+
+ return err;
}
+EXPORT_SYMBOL_GPL(kho_radix_add_page);
-static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
- unsigned long end_pfn)
+/**
+ * kho_radix_del_page - Removes a page's preservation status from the radix tree.
+ * @tree: The KHO radix tree.
+ * @pfn: The page frame number of the page to unpreserve.
+ * @order: The order of the page.
+ *
+ * This function traverses the radix tree and clears the bit corresponding to
+ * the page, effectively removing its "preserved" status. It does not free
+ * the tree's intermediate nodes, even if they become empty.
+ */
+void kho_radix_del_page(struct kho_radix_tree *tree, unsigned long pfn,
+ unsigned int order)
{
- unsigned int order;
+ unsigned long key = kho_radix_encode_key(PFN_PHYS(pfn), order);
+ struct kho_radix_node *node = tree->root;
+ struct kho_radix_leaf *leaf;
+ unsigned int i, idx;
- while (pfn < end_pfn) {
- order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
+ if (WARN_ON_ONCE(!tree->root))
+ return;
- __kho_unpreserve_order(track, pfn, order);
+ might_sleep();
- pfn += 1 << order;
+ guard(mutex)(&tree->lock);
+
+ /* Go from high levels to low levels */
+ for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) {
+ idx = kho_radix_get_table_index(key, i);
+
+ /*
+ * If attempting to delete a page that has not been preserved,
+ * return with a warning.
+ */
+ if (WARN_ON(!node->table[idx]))
+ return;
+
+ node = phys_to_virt(node->table[idx]);
}
+
+ /* Handle the leaf level bitmap (level 0) */
+ leaf = (struct kho_radix_leaf *)node;
+ idx = kho_radix_get_bitmap_index(key);
+ __clear_bit(idx, leaf->bitmap);
}
+EXPORT_SYMBOL_GPL(kho_radix_del_page);
-static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
- unsigned int order)
+static int kho_radix_walk_leaf(struct kho_radix_leaf *leaf,
+ unsigned long key,
+ kho_radix_tree_walk_callback_t cb)
{
- struct kho_mem_phys_bits *bits;
- struct kho_mem_phys *physxa, *new_physxa;
- const unsigned long pfn_high = pfn >> order;
+ unsigned long *bitmap = (unsigned long *)leaf;
+ unsigned int order;
+ phys_addr_t phys;
+ unsigned int i;
+ int err;
- might_sleep();
- physxa = xa_load(&track->orders, order);
- if (!physxa) {
- int err;
+ for_each_set_bit(i, bitmap, PAGE_SIZE * BITS_PER_BYTE) {
+ phys = kho_radix_decode_key(key | i, &order);
+ err = cb(phys, order);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
- new_physxa = kzalloc_obj(*physxa);
- if (!new_physxa)
- return -ENOMEM;
+static int __kho_radix_walk_tree(struct kho_radix_node *root,
+ unsigned int level, unsigned long start,
+ kho_radix_tree_walk_callback_t cb)
+{
+ struct kho_radix_node *node;
+ struct kho_radix_leaf *leaf;
+ unsigned long key, i;
+ unsigned int shift;
+ int err;
+
+ for (i = 0; i < PAGE_SIZE / sizeof(phys_addr_t); i++) {
+ if (!root->table[i])
+ continue;
- xa_init(&new_physxa->phys_bits);
- physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa,
- GFP_KERNEL);
+ shift = ((level - 1) * KHO_TABLE_SIZE_LOG2) +
+ KHO_BITMAP_SIZE_LOG2;
+ key = start | (i << shift);
- err = xa_err(physxa);
- if (err || physxa) {
- xa_destroy(&new_physxa->phys_bits);
- kfree(new_physxa);
+ node = phys_to_virt(root->table[i]);
- if (err)
- return err;
+ if (level == 1) {
+ /*
+ * we are at level 1,
+ * node is pointing to the level 0 bitmap.
+ */
+ leaf = (struct kho_radix_leaf *)node;
+ err = kho_radix_walk_leaf(leaf, key, cb);
} else {
- physxa = new_physxa;
+ err = __kho_radix_walk_tree(node, level - 1,
+ key, cb);
}
+
+ if (err)
+ return err;
}
- bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
- if (IS_ERR(bits))
- return PTR_ERR(bits);
+ return 0;
+}
+
+/**
+ * kho_radix_walk_tree - Traverses the radix tree and calls a callback for each preserved page.
+ * @tree: A pointer to the KHO radix tree to walk.
+ * @cb: A callback function of type kho_radix_tree_walk_callback_t that will be
+ * invoked for each preserved page found in the tree. The callback receives
+ * the physical address and order of the preserved page.
+ *
+ * This function walks the radix tree, searching from the specified top level
+ * down to the lowest level (level 0). For each preserved page found, it invokes
+ * the provided callback, passing the page's physical address and order.
+ *
+ * Return: 0 if the walk completed the specified tree, or the non-zero return
+ * value from the callback that stopped the walk.
+ */
+int kho_radix_walk_tree(struct kho_radix_tree *tree,
+ kho_radix_tree_walk_callback_t cb)
+{
+ if (WARN_ON_ONCE(!tree->root))
+ return -EINVAL;
+
+ guard(mutex)(&tree->lock);
- set_bit(pfn_high % PRESERVE_BITS, bits->preserve);
+ return __kho_radix_walk_tree(tree->root, KHO_TREE_MAX_DEPTH - 1, 0, cb);
+}
+EXPORT_SYMBOL_GPL(kho_radix_walk_tree);
- return 0;
+static void __kho_unpreserve(struct kho_radix_tree *tree,
+ unsigned long pfn, unsigned long end_pfn)
+{
+ unsigned int order;
+
+ while (pfn < end_pfn) {
+ order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
+
+ kho_radix_del_page(tree, pfn, order);
+
+ pfn += 1 << order;
+ }
}
/* For physically contiguous 0-order pages. */
static void kho_init_pages(struct page *page, unsigned long nr_pages)
{
- for (unsigned long i = 0; i < nr_pages; i++)
+ for (unsigned long i = 0; i < nr_pages; i++) {
set_page_count(page + i, 1);
+ /* Clear each page's codetag to avoid accounting mismatch. */
+ clear_page_tag_ref(page + i);
+ }
}
static void kho_init_folio(struct page *page, unsigned int order)
@@ -229,6 +385,8 @@ static void kho_init_folio(struct page *page, unsigned int order)
/* Head page gets refcount of 1. */
set_page_count(page, 1);
+ /* Clear head page's codetag to avoid accounting mismatch. */
+ clear_page_tag_ref(page);
/* For higher order folios, tail pages get a page count of zero. */
for (unsigned long i = 1; i < nr_pages; i++)
@@ -253,7 +411,7 @@ static struct page *kho_restore_page(phys_addr_t phys, bool is_folio)
* check also implicitly makes sure phys is order-aligned since for
* non-order-aligned phys addresses, magic will never be set.
*/
- if (WARN_ON_ONCE(info.magic != KHO_PAGE_MAGIC || info.order > MAX_PAGE_ORDER))
+ if (WARN_ON_ONCE(info.magic != KHO_PAGE_MAGIC))
return NULL;
nr_pages = (1 << info.order);
@@ -265,14 +423,6 @@ static struct page *kho_restore_page(phys_addr_t phys, bool is_folio)
else
kho_init_pages(page, nr_pages);
- /* Always mark headpage's codetag as empty to avoid accounting mismatch */
- clear_page_tag_ref(page);
- if (!is_folio) {
- /* Also do that for the non-compound tail pages */
- for (unsigned int i = 1; i < nr_pages; i++)
- clear_page_tag_ref(page + i);
- }
-
adjust_managed_page_count(page, nr_pages);
return page;
}
@@ -321,161 +471,24 @@ struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages)
}
EXPORT_SYMBOL_GPL(kho_restore_pages);
-/* Serialize and deserialize struct kho_mem_phys across kexec
- *
- * Record all the bitmaps in a linked list of pages for the next kernel to
- * process. Each chunk holds bitmaps of the same order and each block of bitmaps
- * starts at a given physical address. This allows the bitmaps to be sparse. The
- * xarray is used to store them in a tree while building up the data structure,
- * but the KHO successor kernel only needs to process them once in order.
- *
- * All of this memory is normal kmalloc() memory and is not marked for
- * preservation. The successor kernel will remain isolated to the scratch space
- * until it completes processing this list. Once processed all the memory
- * storing these ranges will be marked as free.
- */
-
-struct khoser_mem_bitmap_ptr {
- phys_addr_t phys_start;
- DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *);
-};
-
-struct khoser_mem_chunk_hdr {
- DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *);
- unsigned int order;
- unsigned int num_elms;
-};
-
-#define KHOSER_BITMAP_SIZE \
- ((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \
- sizeof(struct khoser_mem_bitmap_ptr))
-
-struct khoser_mem_chunk {
- struct khoser_mem_chunk_hdr hdr;
- struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE];
-};
-
-static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE);
-
-static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk,
- unsigned long order)
-{
- struct khoser_mem_chunk *chunk __free(free_page) = NULL;
-
- chunk = (void *)get_zeroed_page(GFP_KERNEL);
- if (!chunk)
- return ERR_PTR(-ENOMEM);
-
- if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE)))
- return ERR_PTR(-EINVAL);
-
- chunk->hdr.order = order;
- if (cur_chunk)
- KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk);
- return no_free_ptr(chunk);
-}
-
-static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
+static int __init kho_preserved_memory_reserve(phys_addr_t phys,
+ unsigned int order)
{
- struct khoser_mem_chunk *chunk = first_chunk;
-
- while (chunk) {
- struct khoser_mem_chunk *tmp = chunk;
-
- chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
- free_page((unsigned long)tmp);
- }
-}
-
-/*
- * Update memory map property, if old one is found discard it via
- * kho_mem_ser_free().
- */
-static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk)
-{
- void *ptr;
- u64 phys;
-
- ptr = fdt_getprop_w(kho_out.fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, NULL);
-
- /* Check and discard previous memory map */
- phys = get_unaligned((u64 *)ptr);
- if (phys)
- kho_mem_ser_free((struct khoser_mem_chunk *)phys_to_virt(phys));
-
- /* Update with the new value */
- phys = first_chunk ? (u64)virt_to_phys(first_chunk) : 0;
- put_unaligned(phys, (u64 *)ptr);
-}
-
-static int kho_mem_serialize(struct kho_out *kho_out)
-{
- struct khoser_mem_chunk *first_chunk = NULL;
- struct khoser_mem_chunk *chunk = NULL;
- struct kho_mem_phys *physxa;
- unsigned long order;
- int err = -ENOMEM;
-
- xa_for_each(&kho_out->track.orders, order, physxa) {
- struct kho_mem_phys_bits *bits;
- unsigned long phys;
-
- chunk = new_chunk(chunk, order);
- if (IS_ERR(chunk)) {
- err = PTR_ERR(chunk);
- goto err_free;
- }
-
- if (!first_chunk)
- first_chunk = chunk;
-
- xa_for_each(&physxa->phys_bits, phys, bits) {
- struct khoser_mem_bitmap_ptr *elm;
+ union kho_page_info info;
+ struct page *page;
+ u64 sz;
- if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) {
- chunk = new_chunk(chunk, order);
- if (IS_ERR(chunk)) {
- err = PTR_ERR(chunk);
- goto err_free;
- }
- }
+ sz = 1 << (order + PAGE_SHIFT);
+ page = phys_to_page(phys);
- elm = &chunk->bitmaps[chunk->hdr.num_elms];
- chunk->hdr.num_elms++;
- elm->phys_start = (phys * PRESERVE_BITS)
- << (order + PAGE_SHIFT);
- KHOSER_STORE_PTR(elm->bitmap, bits);
- }
- }
-
- kho_update_memory_map(first_chunk);
+ /* Reserve the memory preserved in KHO in memblock */
+ memblock_reserve(phys, sz);
+ memblock_reserved_mark_noinit(phys, sz);
+ info.magic = KHO_PAGE_MAGIC;
+ info.order = order;
+ page->private = info.page_private;
return 0;
-
-err_free:
- kho_mem_ser_free(first_chunk);
- return err;
-}
-
-static void __init deserialize_bitmap(unsigned int order,
- struct khoser_mem_bitmap_ptr *elm)
-{
- struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
- unsigned long bit;
-
- for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) {
- int sz = 1 << (order + PAGE_SHIFT);
- phys_addr_t phys =
- elm->phys_start + (bit << (order + PAGE_SHIFT));
- struct page *page = phys_to_page(phys);
- union kho_page_info info;
-
- memblock_reserve(phys, sz);
- memblock_reserved_mark_noinit(phys, sz);
- info.magic = KHO_PAGE_MAGIC;
- info.order = order;
- page->private = info.page_private;
- }
}
/* Returns physical address of the preserved memory map from FDT */
@@ -486,25 +499,13 @@ static phys_addr_t __init kho_get_mem_map_phys(const void *fdt)
mem_ptr = fdt_getprop(fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, &len);
if (!mem_ptr || len != sizeof(u64)) {
- pr_err("failed to get preserved memory bitmaps\n");
+ pr_err("failed to get preserved memory map\n");
return 0;
}
return get_unaligned((const u64 *)mem_ptr);
}
-static void __init kho_mem_deserialize(struct khoser_mem_chunk *chunk)
-{
- while (chunk) {
- unsigned int i;
-
- for (i = 0; i != chunk->hdr.num_elms; i++)
- deserialize_bitmap(chunk->hdr.order,
- &chunk->bitmaps[i]);
- chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
- }
-}
-
/*
* With KHO enabled, memory can become fragmented because KHO regions may
* be anywhere in physical address space. The scratch regions give us a
@@ -815,14 +816,14 @@ EXPORT_SYMBOL_GPL(kho_remove_subtree);
*/
int kho_preserve_folio(struct folio *folio)
{
+ struct kho_radix_tree *tree = &kho_out.radix_tree;
const unsigned long pfn = folio_pfn(folio);
const unsigned int order = folio_order(folio);
- struct kho_mem_track *track = &kho_out.track;
if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order)))
return -EINVAL;
- return __kho_preserve_order(track, pfn, order);
+ return kho_radix_add_page(tree, pfn, order);
}
EXPORT_SYMBOL_GPL(kho_preserve_folio);
@@ -836,11 +837,11 @@ EXPORT_SYMBOL_GPL(kho_preserve_folio);
*/
void kho_unpreserve_folio(struct folio *folio)
{
+ struct kho_radix_tree *tree = &kho_out.radix_tree;
const unsigned long pfn = folio_pfn(folio);
const unsigned int order = folio_order(folio);
- struct kho_mem_track *track = &kho_out.track;
- __kho_unpreserve_order(track, pfn, order);
+ kho_radix_del_page(tree, pfn, order);
}
EXPORT_SYMBOL_GPL(kho_unpreserve_folio);
@@ -856,7 +857,7 @@ EXPORT_SYMBOL_GPL(kho_unpreserve_folio);
*/
int kho_preserve_pages(struct page *page, unsigned long nr_pages)
{
- struct kho_mem_track *track = &kho_out.track;
+ struct kho_radix_tree *tree = &kho_out.radix_tree;
const unsigned long start_pfn = page_to_pfn(page);
const unsigned long end_pfn = start_pfn + nr_pages;
unsigned long pfn = start_pfn;
@@ -869,10 +870,18 @@ int kho_preserve_pages(struct page *page, unsigned long nr_pages)
}
while (pfn < end_pfn) {
- const unsigned int order =
+ unsigned int order =
min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
- err = __kho_preserve_order(track, pfn, order);
+ /*
+ * Make sure all the pages in a single preservation are in the
+ * same NUMA node. The restore machinery can not cope with a
+ * preservation spanning multiple NUMA nodes.
+ */
+ while (pfn_to_nid(pfn) != pfn_to_nid(pfn + (1UL << order) - 1))
+ order--;
+
+ err = kho_radix_add_page(tree, pfn, order);
if (err) {
failed_pfn = pfn;
break;
@@ -882,7 +891,7 @@ int kho_preserve_pages(struct page *page, unsigned long nr_pages)
}
if (err)
- __kho_unpreserve(track, start_pfn, failed_pfn);
+ __kho_unpreserve(tree, start_pfn, failed_pfn);
return err;
}
@@ -900,11 +909,11 @@ EXPORT_SYMBOL_GPL(kho_preserve_pages);
*/
void kho_unpreserve_pages(struct page *page, unsigned long nr_pages)
{
- struct kho_mem_track *track = &kho_out.track;
+ struct kho_radix_tree *tree = &kho_out.radix_tree;
const unsigned long start_pfn = page_to_pfn(page);
const unsigned long end_pfn = start_pfn + nr_pages;
- __kho_unpreserve(track, start_pfn, end_pfn);
+ __kho_unpreserve(tree, start_pfn, end_pfn);
}
EXPORT_SYMBOL_GPL(kho_unpreserve_pages);
@@ -963,14 +972,14 @@ err_free:
static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk,
unsigned short order)
{
- struct kho_mem_track *track = &kho_out.track;
+ struct kho_radix_tree *tree = &kho_out.radix_tree;
unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));
- __kho_unpreserve(track, pfn, pfn + 1);
+ __kho_unpreserve(tree, pfn, pfn + 1);
for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
pfn = PHYS_PFN(chunk->phys[i]);
- __kho_unpreserve(track, pfn, pfn + (1 << order));
+ __kho_unpreserve(tree, pfn, pfn + (1 << order));
}
}
@@ -1077,6 +1086,7 @@ EXPORT_SYMBOL_GPL(kho_unpreserve_vmalloc);
void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
{
struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);
+ kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_PROT_NORMAL;
unsigned int align, order, shift, vm_flags;
unsigned long total_pages, contig_pages;
unsigned long addr, size;
@@ -1128,7 +1138,8 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
goto err_free_pages_array;
area = __get_vm_area_node(total_pages * PAGE_SIZE, align, shift,
- vm_flags, VMALLOC_START, VMALLOC_END,
+ vm_flags | VM_UNINITIALIZED,
+ VMALLOC_START, VMALLOC_END,
NUMA_NO_NODE, GFP_KERNEL,
__builtin_return_address(0));
if (!area)
@@ -1143,6 +1154,13 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
area->nr_pages = total_pages;
area->pages = pages;
+ if (vm_flags & VM_ALLOC)
+ kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
+
+ area->addr = kasan_unpoison_vmalloc(area->addr, total_pages * PAGE_SIZE,
+ kasan_flags);
+ clear_vm_uninitialized_flag(area);
+
return area->addr;
err_free_vm_area:
@@ -1239,33 +1257,9 @@ void kho_restore_free(void *mem)
}
EXPORT_SYMBOL_GPL(kho_restore_free);
-int kho_finalize(void)
-{
- int ret;
-
- if (!kho_enable)
- return -EOPNOTSUPP;
-
- guard(mutex)(&kho_out.lock);
- ret = kho_mem_serialize(&kho_out);
- if (ret)
- return ret;
-
- kho_out.finalized = true;
-
- return 0;
-}
-
-bool kho_finalized(void)
-{
- guard(mutex)(&kho_out.lock);
- return kho_out.finalized;
-}
-
struct kho_in {
phys_addr_t fdt_phys;
phys_addr_t scratch_phys;
- phys_addr_t mem_map_phys;
struct kho_debugfs dbg;
};
@@ -1333,18 +1327,46 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
}
EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
+static int __init kho_mem_retrieve(const void *fdt)
+{
+ struct kho_radix_tree tree;
+ const phys_addr_t *mem;
+ int len;
+
+ /* Retrieve the KHO radix tree from passed-in FDT. */
+ mem = fdt_getprop(fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, &len);
+
+ if (!mem || len != sizeof(*mem)) {
+ pr_err("failed to get preserved KHO memory tree\n");
+ return -ENOENT;
+ }
+
+ if (!*mem)
+ return -EINVAL;
+
+ tree.root = phys_to_virt(*mem);
+ mutex_init(&tree.lock);
+ return kho_radix_walk_tree(&tree, kho_preserved_memory_reserve);
+}
+
static __init int kho_out_fdt_setup(void)
{
+ struct kho_radix_tree *tree = &kho_out.radix_tree;
void *root = kho_out.fdt;
- u64 empty_mem_map = 0;
+ u64 preserved_mem_tree_pa;
int err;
err = fdt_create(root, PAGE_SIZE);
err |= fdt_finish_reservemap(root);
err |= fdt_begin_node(root, "");
err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE);
- err |= fdt_property(root, KHO_FDT_MEMORY_MAP_PROP_NAME, &empty_mem_map,
- sizeof(empty_mem_map));
+
+ preserved_mem_tree_pa = virt_to_phys(tree->root);
+
+ err |= fdt_property(root, KHO_FDT_MEMORY_MAP_PROP_NAME,
+ &preserved_mem_tree_pa,
+ sizeof(preserved_mem_tree_pa));
+
err |= fdt_end_node(root);
err |= fdt_finish(root);
@@ -1353,16 +1375,23 @@ static __init int kho_out_fdt_setup(void)
static __init int kho_init(void)
{
+ struct kho_radix_tree *tree = &kho_out.radix_tree;
const void *fdt = kho_get_fdt();
int err = 0;
if (!kho_enable)
return 0;
+ tree->root = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!tree->root) {
+ err = -ENOMEM;
+ goto err_free_scratch;
+ }
+
kho_out.fdt = kho_alloc_preserve(PAGE_SIZE);
if (IS_ERR(kho_out.fdt)) {
err = PTR_ERR(kho_out.fdt);
- goto err_free_scratch;
+ goto err_free_kho_radix_tree_root;
}
err = kho_debugfs_init();
@@ -1408,6 +1437,9 @@ static __init int kho_init(void)
err_free_fdt:
kho_unpreserve_free(kho_out.fdt);
+err_free_kho_radix_tree_root:
+ kfree(tree->root);
+ tree->root = NULL;
err_free_scratch:
kho_out.fdt = NULL;
for (int i = 0; i < kho_scratch_cnt; i++) {
@@ -1447,10 +1479,12 @@ static void __init kho_release_scratch(void)
void __init kho_memory_init(void)
{
- if (kho_in.mem_map_phys) {
+ if (kho_in.scratch_phys) {
kho_scratch = phys_to_virt(kho_in.scratch_phys);
kho_release_scratch();
- kho_mem_deserialize(phys_to_virt(kho_in.mem_map_phys));
+
+ if (kho_mem_retrieve(kho_get_fdt()))
+ kho_in.fdt_phys = 0;
} else {
kho_reserve_scratch();
}
@@ -1528,7 +1562,6 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
kho_in.fdt_phys = fdt_phys;
kho_in.scratch_phys = scratch_phys;
- kho_in.mem_map_phys = mem_map_phys;
kho_scratch_cnt = scratch_cnt;
populated = true;
diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c
index 2f93939168ab..acf368222682 100644
--- a/kernel/liveupdate/kexec_handover_debugfs.c
+++ b/kernel/liveupdate/kexec_handover_debugfs.c
@@ -13,6 +13,7 @@
#include <linux/io.h>
#include <linux/libfdt.h>
#include <linux/mm.h>
+#include <linux/kho/abi/kexec_handover.h>
#include "kexec_handover_internal.h"
static struct dentry *debugfs_root;
@@ -75,24 +76,6 @@ void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt)
}
}
-static int kho_out_finalize_get(void *data, u64 *val)
-{
- *val = kho_finalized();
-
- return 0;
-}
-
-static int kho_out_finalize_set(void *data, u64 val)
-{
- if (val)
- return kho_finalize();
- else
- return -EINVAL;
-}
-
-DEFINE_DEBUGFS_ATTRIBUTE(kho_out_finalize_fops, kho_out_finalize_get,
- kho_out_finalize_set, "%llu\n");
-
static int scratch_phys_show(struct seq_file *m, void *v)
{
for (int i = 0; i < kho_scratch_cnt; i++)
@@ -139,7 +122,7 @@ __init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt)
const char *name = fdt_get_name(fdt, child, NULL);
const u64 *fdt_phys;
- fdt_phys = fdt_getprop(fdt, child, "fdt", &len);
+ fdt_phys = fdt_getprop(fdt, child, KHO_FDT_SUB_TREE_PROP_NAME, &len);
if (!fdt_phys)
continue;
if (len != sizeof(*fdt_phys)) {
@@ -198,11 +181,6 @@ __init int kho_out_debugfs_init(struct kho_debugfs *dbg)
if (IS_ERR(f))
goto err_rmdir;
- f = debugfs_create_file("finalize", 0600, dir, NULL,
- &kho_out_finalize_fops);
- if (IS_ERR(f))
- goto err_rmdir;
-
dbg->dir = dir;
dbg->sub_fdt_dir = sub_fdt_dir;
return 0;
diff --git a/kernel/liveupdate/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h
index 0202c85ad14f..9a832a35254c 100644
--- a/kernel/liveupdate/kexec_handover_internal.h
+++ b/kernel/liveupdate/kexec_handover_internal.h
@@ -22,9 +22,6 @@ struct kho_debugfs {};
extern struct kho_scratch *kho_scratch;
extern unsigned int kho_scratch_cnt;
-bool kho_finalized(void);
-int kho_finalize(void);
-
#ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS
int kho_debugfs_init(void);
void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt);
diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c
index dda7bb57d421..84ac728d63ba 100644
--- a/kernel/liveupdate/luo_core.c
+++ b/kernel/liveupdate/luo_core.c
@@ -230,17 +230,7 @@ int liveupdate_reboot(void)
luo_flb_serialize();
- err = kho_finalize();
- if (err) {
- pr_err("kho_finalize failed %d\n", err);
- /*
- * kho_finalize() may return libfdt errors, to aboid passing to
- * userspace unknown errors, change this to EAGAIN.
- */
- err = -EAGAIN;
- }
-
- return err;
+ return 0;
}
/**
diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c
index 8c79058253e1..5acee4174bf0 100644
--- a/kernel/liveupdate/luo_file.c
+++ b/kernel/liveupdate/luo_file.c
@@ -134,9 +134,12 @@ static LIST_HEAD(luo_file_handler_list);
* state that is not preserved. Set by the handler's .preserve()
* callback, and must be freed in the handler's .unpreserve()
* callback.
- * @retrieved: A flag indicating whether a user/kernel in the new kernel has
+ * @retrieve_status: Status code indicating whether a user/kernel in the new kernel has
* successfully called retrieve() on this file. This prevents
- * multiple retrieval attempts.
+ * multiple retrieval attempts. A value of 0 means retrieve()
+ * has not been attempted, a positive value means retrieve()
+ * succeeded, and a negative value means retrieve() failed, in
+ * which case the value is the error code of the failed call.
* @mutex: A mutex that protects the fields of this specific instance
* (e.g., @retrieved, @file), ensuring that operations like
* retrieving or finishing a file are atomic.
@@ -161,7 +164,7 @@ struct luo_file {
struct file *file;
u64 serialized_data;
void *private_data;
- bool retrieved;
+ int retrieve_status;
struct mutex mutex;
struct list_head list;
u64 token;
@@ -298,7 +301,6 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
luo_file->file = file;
luo_file->fh = fh;
luo_file->token = token;
- luo_file->retrieved = false;
mutex_init(&luo_file->mutex);
args.handler = fh;
@@ -577,7 +579,12 @@ int luo_retrieve_file(struct luo_file_set *file_set, u64 token,
return -ENOENT;
guard(mutex)(&luo_file->mutex);
- if (luo_file->retrieved) {
+ if (luo_file->retrieve_status < 0) {
+ /* Retrieve was attempted and it failed. Return the error code. */
+ return luo_file->retrieve_status;
+ }
+
+ if (luo_file->retrieve_status > 0) {
/*
* Someone is asking for this file again, so get a reference
* for them.
@@ -590,16 +597,19 @@ int luo_retrieve_file(struct luo_file_set *file_set, u64 token,
args.handler = luo_file->fh;
args.serialized_data = luo_file->serialized_data;
err = luo_file->fh->ops->retrieve(&args);
- if (!err) {
- luo_file->file = args.file;
-
- /* Get reference so we can keep this file in LUO until finish */
- get_file(luo_file->file);
- *filep = luo_file->file;
- luo_file->retrieved = true;
+ if (err) {
+ /* Keep the error code for later use. */
+ luo_file->retrieve_status = err;
+ return err;
}
- return err;
+ luo_file->file = args.file;
+ /* Get reference so we can keep this file in LUO until finish */
+ get_file(luo_file->file);
+ *filep = luo_file->file;
+ luo_file->retrieve_status = 1;
+
+ return 0;
}
static int luo_file_can_finish_one(struct luo_file_set *file_set,
@@ -615,7 +625,7 @@ static int luo_file_can_finish_one(struct luo_file_set *file_set,
args.handler = luo_file->fh;
args.file = luo_file->file;
args.serialized_data = luo_file->serialized_data;
- args.retrieved = luo_file->retrieved;
+ args.retrieve_status = luo_file->retrieve_status;
can_finish = luo_file->fh->ops->can_finish(&args);
}
@@ -632,7 +642,7 @@ static void luo_file_finish_one(struct luo_file_set *file_set,
args.handler = luo_file->fh;
args.file = luo_file->file;
args.serialized_data = luo_file->serialized_data;
- args.retrieved = luo_file->retrieved;
+ args.retrieve_status = luo_file->retrieve_status;
luo_file->fh->ops->finish(&args);
luo_flb_file_finish(luo_file->fh);
@@ -788,7 +798,6 @@ int luo_file_deserialize(struct luo_file_set *file_set,
luo_file->file = NULL;
luo_file->serialized_data = file_ser[i].data;
luo_file->token = file_ser[i].token;
- luo_file->retrieved = false;
mutex_init(&luo_file->mutex);
list_add_tail(&luo_file->list, &file_set->files_list);
}
diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c
index 783677295640..25ae704d7787 100644
--- a/kernel/liveupdate/luo_session.c
+++ b/kernel/liveupdate/luo_session.c
@@ -558,8 +558,13 @@ int luo_session_deserialize(void)
}
scoped_guard(mutex, &session->mutex) {
- luo_file_deserialize(&session->file_set,
- &sh->ser[i].file_set_ser);
+ err = luo_file_deserialize(&session->file_set,
+ &sh->ser[i].file_set_ser);
+ }
+ if (err) {
+ pr_warn("Failed to deserialize files for session [%s] %pe\n",
+ session->name, ERR_PTR(err));
+ return err;
}
}
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index a114949eeed5..cee1901d4cff 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -3,6 +3,11 @@
# and is generally not a function of system call inputs.
KCOV_INSTRUMENT := n
+CONTEXT_ANALYSIS_mutex.o := y
+CONTEXT_ANALYSIS_rtmutex_api.o := y
+CONTEXT_ANALYSIS_ww_rt_mutex.o := y
+CONTEXT_ANALYSIS_rwsem.o := y
+
obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
# Avoid recursion lockdep -> sanitizer -> ... -> lockdep & improve performance.
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 2c6b02d4699b..785decd9d0c0 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -37,9 +37,8 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
{
lockdep_assert_held(&lock->wait_lock);
- DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list));
+ DEBUG_LOCKS_WARN_ON(!lock->first_waiter);
DEBUG_LOCKS_WARN_ON(waiter->magic != waiter);
- DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
}
void debug_mutex_free_waiter(struct mutex_waiter *waiter)
@@ -54,15 +53,14 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
lockdep_assert_held(&lock->wait_lock);
/* Current thread can't be already blocked (since it's executing!) */
- DEBUG_LOCKS_WARN_ON(__get_task_blocked_on(task));
+ DEBUG_LOCKS_WARN_ON(get_task_blocked_on(task));
}
void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct task_struct *task)
{
- struct mutex *blocked_on = __get_task_blocked_on(task);
+ struct mutex *blocked_on = get_task_blocked_on(task);
- DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
DEBUG_LOCKS_WARN_ON(waiter->task != task);
DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on != lock);
@@ -74,7 +72,6 @@ void debug_mutex_unlock(struct mutex *lock)
{
if (likely(debug_locks)) {
DEBUG_LOCKS_WARN_ON(lock->magic != lock);
- DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
}
}
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 2a1d165b3167..186b463fe326 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -46,8 +46,9 @@
static void __mutex_init_generic(struct mutex *lock)
{
atomic_long_set(&lock->owner, 0);
- raw_spin_lock_init(&lock->wait_lock);
- INIT_LIST_HEAD(&lock->wait_list);
+ scoped_guard (raw_spinlock_init, &lock->wait_lock) {
+ lock->first_waiter = NULL;
+ }
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
osq_lock_init(&lock->osq);
#endif
@@ -150,6 +151,7 @@ EXPORT_SYMBOL(mutex_init_generic);
* follow with a __mutex_trylock() before failing.
*/
static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
+ __cond_acquires(true, lock)
{
unsigned long curr = (unsigned long)current;
unsigned long zero = 0UL;
@@ -163,6 +165,7 @@ static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
}
static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
+ __cond_releases(true, lock)
{
unsigned long curr = (unsigned long)current;
@@ -171,7 +174,7 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
#else /* !CONFIG_DEBUG_LOCK_ALLOC */
-void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key)
+void mutex_init_lockdep(struct mutex *lock, const char *name, struct lock_class_key *key)
{
__mutex_init_generic(lock);
@@ -181,7 +184,7 @@ void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_k
debug_check_no_locks_freed((void *)lock, sizeof(*lock));
lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP);
}
-EXPORT_SYMBOL(mutex_init_lockep);
+EXPORT_SYMBOL(mutex_init_lockdep);
#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag)
@@ -194,33 +197,44 @@ static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag)
atomic_long_andnot(flag, &lock->owner);
}
-static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_waiter *waiter)
-{
- return list_first_entry(&lock->wait_list, struct mutex_waiter, list) == waiter;
-}
-
/*
* Add @waiter to a given location in the lock wait_list and set the
* FLAG_WAITERS flag if it's the first waiter.
*/
static void
__mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
- struct list_head *list)
+ struct mutex_waiter *first)
+ __must_hold(&lock->wait_lock)
{
hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX);
debug_mutex_add_waiter(lock, waiter, current);
- list_add_tail(&waiter->list, list);
- if (__mutex_waiter_is_first(lock, waiter))
+ if (!first)
+ first = lock->first_waiter;
+
+ if (first) {
+ list_add_tail(&waiter->list, &first->list);
+ } else {
+ INIT_LIST_HEAD(&waiter->list);
+ lock->first_waiter = waiter;
__mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
+ }
}
static void
__mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter)
+ __must_hold(&lock->wait_lock)
{
- list_del(&waiter->list);
- if (likely(list_empty(&lock->wait_list)))
+ if (list_empty(&waiter->list)) {
__mutex_clear_flag(lock, MUTEX_FLAGS);
+ lock->first_waiter = NULL;
+ } else {
+ if (lock->first_waiter == waiter) {
+ lock->first_waiter = list_first_entry(&waiter->list,
+ struct mutex_waiter, list);
+ }
+ list_del(&waiter->list);
+ }
debug_mutex_remove_waiter(lock, waiter, current);
hung_task_clear_blocker();
@@ -259,7 +273,8 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task)
* We also put the fastpath first in the kernel image, to make sure the
* branch is predicted by the CPU as default-untaken.
*/
-static void __sched __mutex_lock_slowpath(struct mutex *lock);
+static void __sched __mutex_lock_slowpath(struct mutex *lock)
+ __acquires(lock);
/**
* mutex_lock - acquire the mutex
@@ -340,7 +355,7 @@ bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
* Similarly, stop spinning if we are no longer the
* first waiter.
*/
- if (waiter && !__mutex_waiter_is_first(lock, waiter))
+ if (waiter && data_race(lock->first_waiter != waiter))
return false;
return true;
@@ -525,7 +540,8 @@ mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
}
#endif
-static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip);
+static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip)
+ __releases(lock);
/**
* mutex_unlock - release the mutex
@@ -565,6 +581,7 @@ EXPORT_SYMBOL(mutex_unlock);
* of a unlocked mutex is not allowed.
*/
void __sched ww_mutex_unlock(struct ww_mutex *lock)
+ __no_context_analysis
{
__ww_mutex_unlock(lock);
mutex_unlock(&lock->base);
@@ -578,6 +595,7 @@ static __always_inline int __sched
__mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclass,
struct lockdep_map *nest_lock, unsigned long ip,
struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
+ __cond_acquires(0, lock)
{
DEFINE_WAKE_Q(wake_q);
struct mutex_waiter waiter;
@@ -645,7 +663,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
if (!use_ww_ctx) {
/* add waiting tasks to the end of the waitqueue (FIFO): */
- __mutex_add_waiter(lock, &waiter, &lock->wait_list);
+ __mutex_add_waiter(lock, &waiter, NULL);
} else {
/*
* Add in stamp order, waking up waiters that must kill
@@ -656,6 +674,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
goto err_early_kill;
}
+ raw_spin_lock(&current->blocked_lock);
__set_task_blocked_on(current, lock);
set_current_state(state);
trace_contention_begin(lock, LCB_F_MUTEX);
@@ -669,8 +688,9 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
* the handoff.
*/
if (__mutex_trylock(lock))
- goto acquired;
+ break;
+ raw_spin_unlock(&current->blocked_lock);
/*
* Check for signals and kill conditions while holding
* wait_lock. This ensures the lock cancellation is ordered
@@ -691,14 +711,16 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
schedule_preempt_disabled();
- first = __mutex_waiter_is_first(lock, &waiter);
+ first = lock->first_waiter == &waiter;
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ raw_spin_lock(&current->blocked_lock);
/*
* As we likely have been woken up by task
* that has cleared our blocked_on state, re-set
* it to the lock we are trying to acquire.
*/
- set_task_blocked_on(current, lock);
+ __set_task_blocked_on(current, lock);
set_current_state(state);
/*
* Here we order against unlock; we must either see it change
@@ -709,33 +731,40 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
break;
if (first) {
- trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
+ bool opt_acquired;
+
/*
* mutex_optimistic_spin() can call schedule(), so
- * clear blocked on so we don't become unselectable
+ * we need to release these locks before calling it,
+ * and clear blocked on so we don't become unselectable
* to run.
*/
- clear_task_blocked_on(current, lock);
- if (mutex_optimistic_spin(lock, ww_ctx, &waiter))
+ __clear_task_blocked_on(current, lock);
+ raw_spin_unlock(&current->blocked_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
+ opt_acquired = mutex_optimistic_spin(lock, ww_ctx, &waiter);
+
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ raw_spin_lock(&current->blocked_lock);
+ __set_task_blocked_on(current, lock);
+
+ if (opt_acquired)
break;
- set_task_blocked_on(current, lock);
trace_contention_begin(lock, LCB_F_MUTEX);
}
-
- raw_spin_lock_irqsave(&lock->wait_lock, flags);
}
- raw_spin_lock_irqsave(&lock->wait_lock, flags);
-acquired:
__clear_task_blocked_on(current, lock);
__set_current_state(TASK_RUNNING);
+ raw_spin_unlock(&current->blocked_lock);
if (ww_ctx) {
/*
* Wound-Wait; we stole the lock (!first_waiter), check the
* waiters as anyone might want to wound us.
*/
- if (!ww_ctx->is_wait_die &&
- !__mutex_waiter_is_first(lock, &waiter))
+ if (!ww_ctx->is_wait_die && lock->first_waiter != &waiter)
__ww_mutex_check_waiters(lock, ww_ctx, &wake_q);
}
@@ -756,11 +785,11 @@ skip_wait:
return 0;
err:
- __clear_task_blocked_on(current, lock);
+ clear_task_blocked_on(current, lock);
__set_current_state(TASK_RUNNING);
__mutex_remove_waiter(lock, &waiter);
err_early_kill:
- WARN_ON(__get_task_blocked_on(current));
+ WARN_ON(get_task_blocked_on(current));
trace_contention_end(lock, ret);
raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
debug_mutex_free_waiter(&waiter);
@@ -772,6 +801,7 @@ err_early_kill:
static int __sched
__mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
struct lockdep_map *nest_lock, unsigned long ip)
+ __cond_acquires(0, lock)
{
return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false);
}
@@ -779,6 +809,7 @@ __mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
static int __sched
__ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
unsigned long ip, struct ww_acquire_ctx *ww_ctx)
+ __cond_acquires(0, lock)
{
return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true);
}
@@ -826,6 +857,7 @@ void __sched
mutex_lock_nested(struct mutex *lock, unsigned int subclass)
{
__mutex_lock(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
+ __acquire(lock);
}
EXPORT_SYMBOL_GPL(mutex_lock_nested);
@@ -834,6 +866,7 @@ void __sched
_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
{
__mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_);
+ __acquire(lock);
}
EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
@@ -862,12 +895,14 @@ mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
token = io_schedule_prepare();
__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
subclass, NULL, _RET_IP_, NULL, 0);
+ __acquire(lock);
io_schedule_finish(token);
}
EXPORT_SYMBOL_GPL(mutex_lock_io_nested);
static inline int
ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+ __cond_releases(nonzero, lock)
{
#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
unsigned tmp;
@@ -929,13 +964,16 @@ EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible);
* Release the lock, slowpath:
*/
static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip)
+ __releases(lock)
{
struct task_struct *next = NULL;
+ struct mutex_waiter *waiter;
DEFINE_WAKE_Q(wake_q);
unsigned long owner;
unsigned long flags;
mutex_release(&lock->dep_map, ip);
+ __release(lock);
/*
* Release the lock before (potentially) taking the spinlock such that
@@ -962,16 +1000,12 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
raw_spin_lock_irqsave(&lock->wait_lock, flags);
debug_mutex_unlock(lock);
- if (!list_empty(&lock->wait_list)) {
- /* get the first entry from the wait-list: */
- struct mutex_waiter *waiter =
- list_first_entry(&lock->wait_list,
- struct mutex_waiter, list);
-
+ waiter = lock->first_waiter;
+ if (waiter) {
next = waiter->task;
debug_mutex_wake_waiter(lock, waiter);
- __clear_task_blocked_on(next, lock);
+ set_task_blocked_on_waking(next, lock);
wake_q_add(&wake_q, next);
}
@@ -1061,24 +1095,29 @@ EXPORT_SYMBOL_GPL(mutex_lock_io);
static noinline void __sched
__mutex_lock_slowpath(struct mutex *lock)
+ __acquires(lock)
{
__mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
+ __acquire(lock);
}
static noinline int __sched
__mutex_lock_killable_slowpath(struct mutex *lock)
+ __cond_acquires(0, lock)
{
return __mutex_lock(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
}
static noinline int __sched
__mutex_lock_interruptible_slowpath(struct mutex *lock)
+ __cond_acquires(0, lock)
{
return __mutex_lock(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
}
static noinline int __sched
__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+ __cond_acquires(0, lock)
{
return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0,
_RET_IP_, ctx);
@@ -1087,6 +1126,7 @@ __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
static noinline int __sched
__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
struct ww_acquire_ctx *ctx)
+ __cond_acquires(0, lock)
{
return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0,
_RET_IP_, ctx);
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 9ad4da8cea00..3e263e98e5fc 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -7,6 +7,7 @@
* Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*/
#ifndef CONFIG_PREEMPT_RT
+#include <linux/mutex.h>
/*
* This is the control structure for tasks blocked on mutex, which resides
* on the blocked task's kernel stack:
@@ -47,6 +48,12 @@ static inline struct task_struct *__mutex_owner(struct mutex *lock)
return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS);
}
+static inline struct mutex *get_task_blocked_on(struct task_struct *p)
+{
+ guard(raw_spinlock_irqsave)(&p->blocked_lock);
+ return __get_task_blocked_on(p);
+}
+
#ifdef CONFIG_DEBUG_MUTEXES
extern void debug_mutex_lock_common(struct mutex *lock,
struct mutex_waiter *waiter);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index c80902eacd79..ccaba6148b61 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -94,6 +94,7 @@ static inline int __ww_mutex_check_kill(struct rt_mutex *lock,
static __always_inline struct task_struct *
rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner)
+ __must_hold(&lock->wait_lock)
{
unsigned long val = (unsigned long)owner;
@@ -105,6 +106,7 @@ rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner)
static __always_inline void
rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner)
+ __must_hold(&lock->wait_lock)
{
/*
* lock->wait_lock is held but explicit acquire semantics are needed
@@ -114,12 +116,14 @@ rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner)
}
static __always_inline void rt_mutex_clear_owner(struct rt_mutex_base *lock)
+ __must_hold(&lock->wait_lock)
{
/* lock->wait_lock is held so the unlock provides release semantics. */
WRITE_ONCE(lock->owner, rt_mutex_owner_encode(lock, NULL));
}
static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock)
+ __must_hold(&lock->wait_lock)
{
lock->owner = (struct task_struct *)
((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
@@ -127,6 +131,7 @@ static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock)
static __always_inline void
fixup_rt_mutex_waiters(struct rt_mutex_base *lock, bool acquire_lock)
+ __must_hold(&lock->wait_lock)
{
unsigned long owner, *p = (unsigned long *) &lock->owner;
@@ -328,6 +333,7 @@ static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
}
static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
+ __must_hold(&lock->wait_lock)
{
lock->owner = (struct task_struct *)
((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
@@ -1206,6 +1212,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
struct ww_acquire_ctx *ww_ctx,
enum rtmutex_chainwalk chwalk,
struct wake_q_head *wake_q)
+ __must_hold(&lock->wait_lock)
{
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex_waiter *top_waiter = waiter;
@@ -1249,6 +1256,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
/* Check whether the waiter should back out immediately */
rtm = container_of(lock, struct rt_mutex, rtmutex);
+ __assume_ctx_lock(&rtm->rtmutex.wait_lock);
res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, wake_q);
if (res) {
raw_spin_lock(&task->pi_lock);
@@ -1356,6 +1364,7 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh,
}
static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock)
+ __must_hold(&lock->wait_lock)
{
int ret = try_to_take_rt_mutex(lock, current, NULL);
@@ -1505,7 +1514,7 @@ static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
* - the VCPU on which owner runs is preempted
*/
if (!owner_on_cpu(owner) || need_resched() ||
- !rt_mutex_waiter_is_top_waiter(lock, waiter)) {
+ !data_race(rt_mutex_waiter_is_top_waiter(lock, waiter))) {
res = false;
break;
}
@@ -1538,6 +1547,7 @@ static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
*/
static void __sched remove_waiter(struct rt_mutex_base *lock,
struct rt_mutex_waiter *waiter)
+ __must_hold(&lock->wait_lock)
{
bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
@@ -1613,6 +1623,8 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
struct task_struct *owner;
int ret = 0;
+ __assume_ctx_lock(&rtm->rtmutex.wait_lock);
+
lockevent_inc(rtmutex_slow_block);
for (;;) {
/* Try to acquire the lock: */
@@ -1658,6 +1670,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
struct rt_mutex_base *lock,
struct rt_mutex_waiter *w)
+ __must_hold(&lock->wait_lock)
{
/*
* If the result is not -EDEADLOCK or the caller requested
@@ -1694,11 +1707,13 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
enum rtmutex_chainwalk chwalk,
struct rt_mutex_waiter *waiter,
struct wake_q_head *wake_q)
+ __must_hold(&lock->wait_lock)
{
struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
struct ww_mutex *ww = ww_container_of(rtm);
int ret;
+ __assume_ctx_lock(&rtm->rtmutex.wait_lock);
lockdep_assert_held(&lock->wait_lock);
lockevent_inc(rtmutex_slowlock);
@@ -1750,6 +1765,7 @@ static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock,
struct ww_acquire_ctx *ww_ctx,
unsigned int state,
struct wake_q_head *wake_q)
+ __must_hold(&lock->wait_lock)
{
struct rt_mutex_waiter waiter;
int ret;
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index 59dbd29cb219..124219aea46e 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -526,6 +526,7 @@ static __always_inline int __mutex_lock_common(struct mutex *lock,
unsigned int subclass,
struct lockdep_map *nest_lock,
unsigned long ip)
+ __acquires(lock) __no_context_analysis
{
int ret;
@@ -647,6 +648,7 @@ EXPORT_SYMBOL(mutex_trylock);
#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
void __sched mutex_unlock(struct mutex *lock)
+ __releases(lock) __no_context_analysis
{
mutex_release(&lock->dep_map, _RET_IP_);
__rt_mutex_unlock(&lock->rtmutex);
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index cf6ddd1b23a2..c38b7bdea7b3 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -79,12 +79,18 @@ struct rt_wake_q_head {
* PI-futex support (proxy locking functions, etc.):
*/
extern void rt_mutex_init_proxy_locked(struct rt_mutex_base *lock,
- struct task_struct *proxy_owner);
-extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock);
+ struct task_struct *proxy_owner)
+ __must_hold(&lock->wait_lock);
+
+extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock)
+ __must_hold(&lock->wait_lock);
+
extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task,
- struct wake_q_head *);
+ struct wake_q_head *)
+ __must_hold(&lock->wait_lock);
+
extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task);
@@ -94,8 +100,9 @@ extern int rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock,
struct rt_mutex_waiter *waiter);
-extern int rt_mutex_futex_trylock(struct rt_mutex_base *l);
-extern int __rt_mutex_futex_trylock(struct rt_mutex_base *l);
+extern int rt_mutex_futex_trylock(struct rt_mutex_base *lock);
+extern int __rt_mutex_futex_trylock(struct rt_mutex_base *lock)
+ __must_hold(&lock->wait_lock);
extern void rt_mutex_futex_unlock(struct rt_mutex_base *lock);
extern bool __rt_mutex_futex_unlock(struct rt_mutex_base *lock,
@@ -109,6 +116,7 @@ extern void rt_mutex_postunlock(struct rt_wake_q_head *wqh);
*/
#ifdef CONFIG_RT_MUTEXES
static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock)
+ __must_hold(&lock->wait_lock)
{
return !RB_EMPTY_ROOT(&lock->waiters.rb_root);
}
@@ -120,6 +128,7 @@ static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock)
*/
static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock,
struct rt_mutex_waiter *waiter)
+ __must_hold(&lock->wait_lock)
{
struct rb_node *leftmost = rb_first_cached(&lock->waiters);
@@ -127,6 +136,7 @@ static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock,
}
static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock)
+ __must_hold(&lock->wait_lock)
{
struct rb_node *leftmost = rb_first_cached(&lock->waiters);
struct rt_mutex_waiter *w = NULL;
@@ -170,9 +180,10 @@ enum rtmutex_chainwalk {
static inline void __rt_mutex_base_init(struct rt_mutex_base *lock)
{
- raw_spin_lock_init(&lock->wait_lock);
- lock->waiters = RB_ROOT_CACHED;
- lock->owner = NULL;
+ scoped_guard (raw_spinlock_init, &lock->wait_lock) {
+ lock->waiters = RB_ROOT_CACHED;
+ lock->owner = NULL;
+ }
}
/* Debug functions */
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index 9f4322c07486..82e078c0665a 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -186,6 +186,7 @@ static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb,
static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias,
unsigned long flags)
+ __releases(&rwb->rtmutex.wait_lock)
{
struct rt_mutex_base *rtm = &rwb->rtmutex;
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 24df4d98f7d2..bf647097369c 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -72,7 +72,7 @@
#c, atomic_long_read(&(sem)->count), \
(unsigned long) sem->magic, \
atomic_long_read(&(sem)->owner), (long)current, \
- list_empty(&(sem)->wait_list) ? "" : "not ")) \
+ rwsem_is_contended(sem) ? "" : "not ")) \
debug_locks_off(); \
} while (0)
#else
@@ -320,9 +320,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
sem->magic = sem;
#endif
atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
- raw_spin_lock_init(&sem->wait_lock);
- INIT_LIST_HEAD(&sem->wait_list);
atomic_long_set(&sem->owner, 0L);
+ scoped_guard (raw_spinlock_init, &sem->wait_lock) {
+ sem->first_waiter = NULL;
+ }
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
osq_lock_init(&sem->osq);
#endif
@@ -341,8 +342,6 @@ struct rwsem_waiter {
unsigned long timeout;
bool handoff_set;
};
-#define rwsem_first_waiter(sem) \
- list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
enum rwsem_wake_type {
RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
@@ -365,12 +364,22 @@ enum rwsem_wake_type {
*/
#define MAX_READERS_WAKEUP 0x100
-static inline void
-rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+static inline
+bool __rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+ __must_hold(&sem->wait_lock)
{
- lockdep_assert_held(&sem->wait_lock);
- list_add_tail(&waiter->list, &sem->wait_list);
- /* caller will set RWSEM_FLAG_WAITERS */
+ if (list_empty(&waiter->list)) {
+ sem->first_waiter = NULL;
+ return false;
+ }
+
+ if (sem->first_waiter == waiter) {
+ sem->first_waiter = list_first_entry(&waiter->list,
+ struct rwsem_waiter, list);
+ }
+ list_del(&waiter->list);
+
+ return true;
}
/*
@@ -385,14 +394,24 @@ static inline bool
rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
{
lockdep_assert_held(&sem->wait_lock);
- list_del(&waiter->list);
- if (likely(!list_empty(&sem->wait_list)))
+ if (__rwsem_del_waiter(sem, waiter))
return true;
-
atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
return false;
}
+static inline
+struct rwsem_waiter *next_waiter(const struct rw_semaphore *sem,
+ const struct rwsem_waiter *waiter)
+ __must_hold(&sem->wait_lock)
+{
+ struct rwsem_waiter *next = list_first_entry(&waiter->list,
+ struct rwsem_waiter, list);
+ if (next == sem->first_waiter)
+ return NULL;
+ return next;
+}
+
/*
* handle the lock release when processes blocked on it that can now run
* - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
@@ -411,7 +430,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
enum rwsem_wake_type wake_type,
struct wake_q_head *wake_q)
{
- struct rwsem_waiter *waiter, *tmp;
+ struct rwsem_waiter *waiter, *next;
long oldcount, woken = 0, adjustment = 0;
struct list_head wlist;
@@ -421,7 +440,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
* Take a peek at the queue head waiter such that we can determine
* the wakeup(s) to perform.
*/
- waiter = rwsem_first_waiter(sem);
+ waiter = sem->first_waiter;
if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
if (wake_type == RWSEM_WAKE_ANY) {
@@ -506,25 +525,28 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
* put them into wake_q to be woken up later.
*/
INIT_LIST_HEAD(&wlist);
- list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
+ do {
+ next = next_waiter(sem, waiter);
if (waiter->type == RWSEM_WAITING_FOR_WRITE)
continue;
woken++;
list_move_tail(&waiter->list, &wlist);
+ if (sem->first_waiter == waiter)
+ sem->first_waiter = next;
/*
* Limit # of readers that can be woken up per wakeup call.
*/
if (unlikely(woken >= MAX_READERS_WAKEUP))
break;
- }
+ } while ((waiter = next) != NULL);
adjustment = woken * RWSEM_READER_BIAS - adjustment;
lockevent_cond_inc(rwsem_wake_reader, woken);
oldcount = atomic_long_read(&sem->count);
- if (list_empty(&sem->wait_list)) {
+ if (!sem->first_waiter) {
/*
* Combined with list_move_tail() above, this implies
* rwsem_del_waiter().
@@ -545,7 +567,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
atomic_long_add(adjustment, &sem->count);
/* 2nd pass */
- list_for_each_entry_safe(waiter, tmp, &wlist, list) {
+ list_for_each_entry_safe(waiter, next, &wlist, list) {
struct task_struct *tsk;
tsk = waiter->task;
@@ -577,7 +599,7 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
struct wake_q_head *wake_q)
__releases(&sem->wait_lock)
{
- bool first = rwsem_first_waiter(sem) == waiter;
+ bool first = sem->first_waiter == waiter;
wake_q_init(wake_q);
@@ -602,8 +624,9 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
*/
static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
struct rwsem_waiter *waiter)
+ __must_hold(&sem->wait_lock)
{
- struct rwsem_waiter *first = rwsem_first_waiter(sem);
+ struct rwsem_waiter *first = sem->first_waiter;
long count, new;
lockdep_assert_held(&sem->wait_lock);
@@ -639,7 +662,7 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
new |= RWSEM_WRITER_LOCKED;
new &= ~RWSEM_FLAG_HANDOFF;
- if (list_is_singular(&sem->wait_list))
+ if (list_empty(&first->list))
new &= ~RWSEM_FLAG_WAITERS;
}
} while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
@@ -659,7 +682,8 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
* Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
* success.
*/
- list_del(&waiter->list);
+ __rwsem_del_waiter(sem, waiter);
+
rwsem_set_owner(sem);
return true;
}
@@ -994,7 +1018,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
{
long adjustment = -RWSEM_READER_BIAS;
long rcnt = (count >> RWSEM_READER_SHIFT);
- struct rwsem_waiter waiter;
+ struct rwsem_waiter waiter, *first;
DEFINE_WAKE_Q(wake_q);
/*
@@ -1019,7 +1043,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
*/
if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) {
raw_spin_lock_irq(&sem->wait_lock);
- if (!list_empty(&sem->wait_list))
+ if (sem->first_waiter)
rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED,
&wake_q);
raw_spin_unlock_irq(&sem->wait_lock);
@@ -1035,7 +1059,8 @@ queue:
waiter.handoff_set = false;
raw_spin_lock_irq(&sem->wait_lock);
- if (list_empty(&sem->wait_list)) {
+ first = sem->first_waiter;
+ if (!first) {
/*
* In case the wait queue is empty and the lock isn't owned
* by a writer, this reader can exit the slowpath and return
@@ -1051,8 +1076,11 @@ queue:
return sem;
}
adjustment += RWSEM_FLAG_WAITERS;
+ INIT_LIST_HEAD(&waiter.list);
+ sem->first_waiter = &waiter;
+ } else {
+ list_add_tail(&waiter.list, &first->list);
}
- rwsem_add_waiter(sem, &waiter);
/* we're now waiting on the lock, but no longer actively locking */
count = atomic_long_add_return(adjustment, &sem->count);
@@ -1110,7 +1138,7 @@ out_nolock:
static struct rw_semaphore __sched *
rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
{
- struct rwsem_waiter waiter;
+ struct rwsem_waiter waiter, *first;
DEFINE_WAKE_Q(wake_q);
/* do optimistic spinning and steal lock if possible */
@@ -1129,10 +1157,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
waiter.handoff_set = false;
raw_spin_lock_irq(&sem->wait_lock);
- rwsem_add_waiter(sem, &waiter);
- /* we're now waiting on the lock */
- if (rwsem_first_waiter(sem) != &waiter) {
+ first = sem->first_waiter;
+ if (first) {
+ list_add_tail(&waiter.list, &first->list);
rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count),
&wake_q);
if (!wake_q_empty(&wake_q)) {
@@ -1145,6 +1173,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
raw_spin_lock_irq(&sem->wait_lock);
}
} else {
+ INIT_LIST_HEAD(&waiter.list);
+ sem->first_waiter = &waiter;
atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
}
@@ -1218,7 +1248,7 @@ static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- if (!list_empty(&sem->wait_list))
+ if (sem->first_waiter)
rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -1239,7 +1269,7 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- if (!list_empty(&sem->wait_list))
+ if (sem->first_waiter)
rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -1532,6 +1562,7 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
* lock for reading
*/
void __sched down_read(struct rw_semaphore *sem)
+ __no_context_analysis
{
might_sleep();
rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
@@ -1541,6 +1572,7 @@ void __sched down_read(struct rw_semaphore *sem)
EXPORT_SYMBOL(down_read);
int __sched down_read_interruptible(struct rw_semaphore *sem)
+ __no_context_analysis
{
might_sleep();
rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
@@ -1555,6 +1587,7 @@ int __sched down_read_interruptible(struct rw_semaphore *sem)
EXPORT_SYMBOL(down_read_interruptible);
int __sched down_read_killable(struct rw_semaphore *sem)
+ __no_context_analysis
{
might_sleep();
rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
@@ -1572,6 +1605,7 @@ EXPORT_SYMBOL(down_read_killable);
* trylock for reading -- returns 1 if successful, 0 if contention
*/
int down_read_trylock(struct rw_semaphore *sem)
+ __no_context_analysis
{
int ret = __down_read_trylock(sem);
@@ -1585,6 +1619,7 @@ EXPORT_SYMBOL(down_read_trylock);
* lock for writing
*/
void __sched down_write(struct rw_semaphore *sem)
+ __no_context_analysis
{
might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
@@ -1596,6 +1631,7 @@ EXPORT_SYMBOL(down_write);
* lock for writing
*/
int __sched down_write_killable(struct rw_semaphore *sem)
+ __no_context_analysis
{
might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
@@ -1614,6 +1650,7 @@ EXPORT_SYMBOL(down_write_killable);
* trylock for writing -- returns 1 if successful, 0 if contention
*/
int down_write_trylock(struct rw_semaphore *sem)
+ __no_context_analysis
{
int ret = __down_write_trylock(sem);
@@ -1628,6 +1665,7 @@ EXPORT_SYMBOL(down_write_trylock);
* release a read lock
*/
void up_read(struct rw_semaphore *sem)
+ __no_context_analysis
{
rwsem_release(&sem->dep_map, _RET_IP_);
__up_read(sem);
@@ -1638,6 +1676,7 @@ EXPORT_SYMBOL(up_read);
* release a write lock
*/
void up_write(struct rw_semaphore *sem)
+ __no_context_analysis
{
rwsem_release(&sem->dep_map, _RET_IP_);
__up_write(sem);
@@ -1648,6 +1687,7 @@ EXPORT_SYMBOL(up_write);
* downgrade write lock to read lock
*/
void downgrade_write(struct rw_semaphore *sem)
+ __no_context_analysis
{
lock_downgrade(&sem->dep_map, _RET_IP_);
__downgrade_write(sem);
@@ -1657,6 +1697,7 @@ EXPORT_SYMBOL(downgrade_write);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void down_read_nested(struct rw_semaphore *sem, int subclass)
+ __no_context_analysis
{
might_sleep();
rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
@@ -1665,6 +1706,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
EXPORT_SYMBOL(down_read_nested);
int down_read_killable_nested(struct rw_semaphore *sem, int subclass)
+ __no_context_analysis
{
might_sleep();
rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
@@ -1679,6 +1721,7 @@ int down_read_killable_nested(struct rw_semaphore *sem, int subclass)
EXPORT_SYMBOL(down_read_killable_nested);
void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
+ __no_context_analysis
{
might_sleep();
rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
@@ -1687,6 +1730,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
EXPORT_SYMBOL(_down_write_nest_lock);
void down_read_non_owner(struct rw_semaphore *sem)
+ __no_context_analysis
{
might_sleep();
__down_read(sem);
@@ -1701,6 +1745,7 @@ void down_read_non_owner(struct rw_semaphore *sem)
EXPORT_SYMBOL(down_read_non_owner);
void down_write_nested(struct rw_semaphore *sem, int subclass)
+ __no_context_analysis
{
might_sleep();
rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
@@ -1709,6 +1754,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
EXPORT_SYMBOL(down_write_nested);
int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
+ __no_context_analysis
{
might_sleep();
rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
@@ -1724,6 +1770,7 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
EXPORT_SYMBOL(down_write_killable_nested);
void up_read_non_owner(struct rw_semaphore *sem)
+ __no_context_analysis
{
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
__up_read(sem);
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 3ef032e22f7e..74d41433ba13 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -21,7 +21,7 @@
* too.
*
* The ->count variable represents how many more tasks can acquire this
- * semaphore. If it's zero, there may be tasks waiting on the wait_list.
+ * semaphore. If it's zero, there may be waiters.
*/
#include <linux/compiler.h>
@@ -226,7 +226,7 @@ void __sched up(struct semaphore *sem)
hung_task_sem_clear_if_holder(sem);
- if (likely(list_empty(&sem->wait_list)))
+ if (likely(!sem->first_waiter))
sem->count++;
else
__up(sem, &wake_q);
@@ -244,6 +244,21 @@ struct semaphore_waiter {
bool up;
};
+static inline
+void sem_del_waiter(struct semaphore *sem, struct semaphore_waiter *waiter)
+{
+ if (list_empty(&waiter->list)) {
+ sem->first_waiter = NULL;
+ return;
+ }
+
+ if (sem->first_waiter == waiter) {
+ sem->first_waiter = list_first_entry(&waiter->list,
+ struct semaphore_waiter, list);
+ }
+ list_del(&waiter->list);
+}
+
/*
* Because this function is inlined, the 'state' parameter will be
* constant, and thus optimised away by the compiler. Likewise the
@@ -252,9 +267,15 @@ struct semaphore_waiter {
static inline int __sched ___down_common(struct semaphore *sem, long state,
long timeout)
{
- struct semaphore_waiter waiter;
-
- list_add_tail(&waiter.list, &sem->wait_list);
+ struct semaphore_waiter waiter, *first;
+
+ first = sem->first_waiter;
+ if (first) {
+ list_add_tail(&waiter.list, &first->list);
+ } else {
+ INIT_LIST_HEAD(&waiter.list);
+ sem->first_waiter = &waiter;
+ }
waiter.task = current;
waiter.up = false;
@@ -274,11 +295,11 @@ static inline int __sched ___down_common(struct semaphore *sem, long state,
}
timed_out:
- list_del(&waiter.list);
+ sem_del_waiter(sem, &waiter);
return -ETIME;
interrupted:
- list_del(&waiter.list);
+ sem_del_waiter(sem, &waiter);
return -EINTR;
}
@@ -321,9 +342,9 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
static noinline void __sched __up(struct semaphore *sem,
struct wake_q_head *wake_q)
{
- struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
- struct semaphore_waiter, list);
- list_del(&waiter->list);
+ struct semaphore_waiter *waiter = sem->first_waiter;
+
+ sem_del_waiter(sem, waiter);
waiter->up = true;
wake_q_add(wake_q, waiter->task);
}
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 7685defd7c52..b42d293da38b 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -64,8 +64,9 @@ EXPORT_PER_CPU_SYMBOL(__mmiowb_state);
* time (making _this_ CPU preemptible if possible), and we also signal
* towards that other CPU that it should break the lock ASAP.
*/
-#define BUILD_LOCK_OPS(op, locktype) \
+#define BUILD_LOCK_OPS(op, locktype, lock_ctx_op) \
static void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
+ lock_ctx_op(lock) \
{ \
for (;;) { \
preempt_disable(); \
@@ -78,6 +79,7 @@ static void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
} \
\
static unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
+ lock_ctx_op(lock) \
{ \
unsigned long flags; \
\
@@ -96,11 +98,13 @@ static unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
} \
\
static void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \
+ lock_ctx_op(lock) \
{ \
_raw_##op##_lock_irqsave(lock); \
} \
\
static void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
+ lock_ctx_op(lock) \
{ \
unsigned long flags; \
\
@@ -123,11 +127,11 @@ static void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
* __[spin|read|write]_lock_irqsave()
* __[spin|read|write]_lock_bh()
*/
-BUILD_LOCK_OPS(spin, raw_spinlock);
+BUILD_LOCK_OPS(spin, raw_spinlock, __acquires);
#ifndef CONFIG_PREEMPT_RT
-BUILD_LOCK_OPS(read, rwlock);
-BUILD_LOCK_OPS(write, rwlock);
+BUILD_LOCK_OPS(read, rwlock, __acquires_shared);
+BUILD_LOCK_OPS(write, rwlock, __acquires);
#endif
#endif
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
index 31a785afee6c..016f0db892a5 100644
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -4,24 +4,21 @@
#define MUTEX mutex
#define MUTEX_WAITER mutex_waiter
+#define WAIT_LOCK wait_lock
static inline struct mutex_waiter *
__ww_waiter_first(struct mutex *lock)
+ __must_hold(&lock->wait_lock)
{
- struct mutex_waiter *w;
-
- w = list_first_entry(&lock->wait_list, struct mutex_waiter, list);
- if (list_entry_is_head(w, &lock->wait_list, list))
- return NULL;
-
- return w;
+ return lock->first_waiter;
}
static inline struct mutex_waiter *
__ww_waiter_next(struct mutex *lock, struct mutex_waiter *w)
+ __must_hold(&lock->wait_lock)
{
w = list_next_entry(w, list);
- if (list_entry_is_head(w, &lock->wait_list, list))
+ if (lock->first_waiter == w)
return NULL;
return w;
@@ -29,9 +26,10 @@ __ww_waiter_next(struct mutex *lock, struct mutex_waiter *w)
static inline struct mutex_waiter *
__ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w)
+ __must_hold(&lock->wait_lock)
{
w = list_prev_entry(w, list);
- if (list_entry_is_head(w, &lock->wait_list, list))
+ if (lock->first_waiter == w)
return NULL;
return w;
@@ -39,23 +37,20 @@ __ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w)
static inline struct mutex_waiter *
__ww_waiter_last(struct mutex *lock)
+ __must_hold(&lock->wait_lock)
{
- struct mutex_waiter *w;
-
- w = list_last_entry(&lock->wait_list, struct mutex_waiter, list);
- if (list_entry_is_head(w, &lock->wait_list, list))
- return NULL;
+ struct mutex_waiter *w = lock->first_waiter;
+ if (w)
+ w = list_prev_entry(w, list);
return w;
}
static inline void
__ww_waiter_add(struct mutex *lock, struct mutex_waiter *waiter, struct mutex_waiter *pos)
+ __must_hold(&lock->wait_lock)
{
- struct list_head *p = &lock->wait_list;
- if (pos)
- p = &pos->list;
- __mutex_add_waiter(lock, waiter, p);
+ __mutex_add_waiter(lock, waiter, pos);
}
static inline struct task_struct *
@@ -71,16 +66,19 @@ __ww_mutex_has_waiters(struct mutex *lock)
}
static inline void lock_wait_lock(struct mutex *lock, unsigned long *flags)
+ __acquires(&lock->wait_lock)
{
raw_spin_lock_irqsave(&lock->wait_lock, *flags);
}
static inline void unlock_wait_lock(struct mutex *lock, unsigned long *flags)
+ __releases(&lock->wait_lock)
{
raw_spin_unlock_irqrestore(&lock->wait_lock, *flags);
}
static inline void lockdep_assert_wait_lock_held(struct mutex *lock)
+ __must_hold(&lock->wait_lock)
{
lockdep_assert_held(&lock->wait_lock);
}
@@ -89,9 +87,11 @@ static inline void lockdep_assert_wait_lock_held(struct mutex *lock)
#define MUTEX rt_mutex
#define MUTEX_WAITER rt_mutex_waiter
+#define WAIT_LOCK rtmutex.wait_lock
static inline struct rt_mutex_waiter *
__ww_waiter_first(struct rt_mutex *lock)
+ __must_hold(&lock->rtmutex.wait_lock)
{
struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root);
if (!n)
@@ -119,6 +119,7 @@ __ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w)
static inline struct rt_mutex_waiter *
__ww_waiter_last(struct rt_mutex *lock)
+ __must_hold(&lock->rtmutex.wait_lock)
{
struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root);
if (!n)
@@ -140,21 +141,25 @@ __ww_mutex_owner(struct rt_mutex *lock)
static inline bool
__ww_mutex_has_waiters(struct rt_mutex *lock)
+ __must_hold(&lock->rtmutex.wait_lock)
{
return rt_mutex_has_waiters(&lock->rtmutex);
}
static inline void lock_wait_lock(struct rt_mutex *lock, unsigned long *flags)
+ __acquires(&lock->rtmutex.wait_lock)
{
raw_spin_lock_irqsave(&lock->rtmutex.wait_lock, *flags);
}
static inline void unlock_wait_lock(struct rt_mutex *lock, unsigned long *flags)
+ __releases(&lock->rtmutex.wait_lock)
{
raw_spin_unlock_irqrestore(&lock->rtmutex.wait_lock, *flags);
}
static inline void lockdep_assert_wait_lock_held(struct rt_mutex *lock)
+ __must_hold(&lock->rtmutex.wait_lock)
{
lockdep_assert_held(&lock->rtmutex.wait_lock);
}
@@ -285,11 +290,11 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
debug_mutex_wake_waiter(lock, waiter);
#endif
/*
- * When waking up the task to die, be sure to clear the
- * blocked_on pointer. Otherwise we can see circular
- * blocked_on relationships that can't resolve.
+ * When waking up the task to die, be sure to set the
+ * blocked_on to PROXY_WAKING. Otherwise we can see
+ * circular blocked_on relationships that can't resolve.
*/
- __clear_task_blocked_on(waiter->task, lock);
+ set_task_blocked_on_waking(waiter->task, lock);
wake_q_add(wake_q, waiter->task);
}
@@ -307,6 +312,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
struct ww_acquire_ctx *ww_ctx,
struct ww_acquire_ctx *hold_ctx,
struct wake_q_head *wake_q)
+ __must_hold(&lock->WAIT_LOCK)
{
struct task_struct *owner = __ww_mutex_owner(lock);
@@ -339,15 +345,15 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
*/
if (owner != current) {
/*
- * When waking up the task to wound, be sure to clear the
- * blocked_on pointer. Otherwise we can see circular
- * blocked_on relationships that can't resolve.
+ * When waking up the task to wound, be sure to set the
+ * blocked_on to PROXY_WAKING. Otherwise we can see
+ * circular blocked_on relationships that can't resolve.
*
* NOTE: We pass NULL here instead of lock, because we
* are waking the mutex owner, who may be currently
* blocked on a different mutex.
*/
- __clear_task_blocked_on(owner, NULL);
+ set_task_blocked_on_waking(owner, NULL);
wake_q_add(wake_q, owner);
}
return true;
@@ -371,6 +377,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
static void
__ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx,
struct wake_q_head *wake_q)
+ __must_hold(&lock->WAIT_LOCK)
{
struct MUTEX_WAITER *cur;
@@ -397,6 +404,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
DEFINE_WAKE_Q(wake_q);
unsigned long flags;
+ bool has_waiters;
ww_mutex_lock_acquired(lock, ctx);
@@ -418,7 +426,8 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
* __ww_mutex_add_waiter() and makes sure we either observe ww->ctx
* and/or !empty list.
*/
- if (likely(!__ww_mutex_has_waiters(&lock->base)))
+ has_waiters = data_race(__ww_mutex_has_waiters(&lock->base));
+ if (likely(!has_waiters))
return;
/*
@@ -464,6 +473,7 @@ __ww_mutex_kill(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx)
static inline int
__ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
struct ww_acquire_ctx *ctx)
+ __must_hold(&lock->WAIT_LOCK)
{
struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
@@ -514,6 +524,7 @@ __ww_mutex_add_waiter(struct MUTEX_WAITER *waiter,
struct MUTEX *lock,
struct ww_acquire_ctx *ww_ctx,
struct wake_q_head *wake_q)
+ __must_hold(&lock->WAIT_LOCK)
{
struct MUTEX_WAITER *cur, *pos = NULL;
bool is_wait_die;
diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c
index c7196de838ed..e07fb3b96bc3 100644
--- a/kernel/locking/ww_rt_mutex.c
+++ b/kernel/locking/ww_rt_mutex.c
@@ -90,6 +90,7 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
EXPORT_SYMBOL(ww_mutex_lock_interruptible);
void __sched ww_mutex_unlock(struct ww_mutex *lock)
+ __no_context_analysis
{
struct rt_mutex *rtm = &lock->base;
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index be74917802ad..43b1bb01fd27 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -169,9 +169,10 @@ config MODVERSIONS
make them incompatible with the kernel you are running. If
unsure, say N.
+if MODVERSIONS
+
choice
prompt "Module versioning implementation"
- depends on MODVERSIONS
help
Select the tool used to calculate symbol versions for modules.
@@ -206,7 +207,7 @@ endchoice
config ASM_MODVERSIONS
bool
- default HAVE_ASM_MODVERSIONS && MODVERSIONS
+ default HAVE_ASM_MODVERSIONS
help
This enables module versioning for exported symbols also from
assembly. This can be enabled only when the target architecture
@@ -214,7 +215,6 @@ config ASM_MODVERSIONS
config EXTENDED_MODVERSIONS
bool "Extended Module Versioning Support"
- depends on MODVERSIONS
help
This enables extended MODVERSIONs support, allowing long symbol
names to be versioned.
@@ -224,7 +224,6 @@ config EXTENDED_MODVERSIONS
config BASIC_MODVERSIONS
bool "Basic Module Versioning Support"
- depends on MODVERSIONS
default y
help
This enables basic MODVERSIONS support, allowing older tools or
@@ -237,6 +236,8 @@ config BASIC_MODVERSIONS
This is enabled by default when MODVERSIONS are enabled.
If unsure, say Y.
+endif # MODVERSIONS
+
config MODULE_SRCVERSION_ALL
bool "Source checksum for all modules"
help
@@ -277,10 +278,11 @@ config MODULE_SIG_FORCE
Reject unsigned modules or signed modules for which we don't have a
key. Without this, such modules will simply taint the kernel.
+if MODULE_SIG || IMA_APPRAISE_MODSIG
+
config MODULE_SIG_ALL
bool "Automatically sign all modules"
default y
- depends on MODULE_SIG || IMA_APPRAISE_MODSIG
help
Sign all modules during make modules_install. Without this option,
modules must be signed manually, using the scripts/sign-file tool.
@@ -290,7 +292,6 @@ comment "Do not forget to sign required modules with scripts/sign-file"
choice
prompt "Hash algorithm to sign modules"
- depends on MODULE_SIG || IMA_APPRAISE_MODSIG
default MODULE_SIG_SHA512
help
This determines which sort of hashing algorithm will be used during
@@ -327,7 +328,6 @@ endchoice
config MODULE_SIG_HASH
string
- depends on MODULE_SIG || IMA_APPRAISE_MODSIG
default "sha256" if MODULE_SIG_SHA256
default "sha384" if MODULE_SIG_SHA384
default "sha512" if MODULE_SIG_SHA512
@@ -335,6 +335,8 @@ config MODULE_SIG_HASH
default "sha3-384" if MODULE_SIG_SHA3_384
default "sha3-512" if MODULE_SIG_SHA3_512
+endif # MODULE_SIG || IMA_APPRAISE_MODSIG
+
config MODULE_COMPRESS
bool "Module compression"
help
@@ -350,9 +352,10 @@ config MODULE_COMPRESS
If unsure, say N.
+if MODULE_COMPRESS
+
choice
prompt "Module compression type"
- depends on MODULE_COMPRESS
help
Choose the supported algorithm for module compression.
@@ -379,7 +382,6 @@ endchoice
config MODULE_COMPRESS_ALL
bool "Automatically compress all modules"
default y
- depends on MODULE_COMPRESS
help
Compress all modules during 'make modules_install'.
@@ -389,7 +391,6 @@ config MODULE_COMPRESS_ALL
config MODULE_DECOMPRESS
bool "Support in-kernel module decompression"
- depends on MODULE_COMPRESS
select ZLIB_INFLATE if MODULE_COMPRESS_GZIP
select XZ_DEC if MODULE_COMPRESS_XZ
select ZSTD_DECOMPRESS if MODULE_COMPRESS_ZSTD
@@ -400,6 +401,8 @@ config MODULE_DECOMPRESS
If unsure, say N.
+endif # MODULE_COMPRESS
+
config MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
bool "Allow loading of modules with missing namespace imports"
help
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 618202578b42..061161cc79d9 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -53,10 +53,8 @@ extern const size_t modinfo_attrs_count;
/* Provided by the linker */
extern const struct kernel_symbol __start___ksymtab[];
extern const struct kernel_symbol __stop___ksymtab[];
-extern const struct kernel_symbol __start___ksymtab_gpl[];
-extern const struct kernel_symbol __stop___ksymtab_gpl[];
extern const u32 __start___kcrctab[];
-extern const u32 __start___kcrctab_gpl[];
+extern const u8 __start___kflagstab[];
#define KMOD_PATH_LEN 256
extern char modprobe_path[];
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 2bac4c7cd019..46dd8d25a605 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -11,6 +11,7 @@
#include <linux/extable.h>
#include <linux/moduleloader.h>
#include <linux/module_signature.h>
+#include <linux/module_symbol.h>
#include <linux/trace_events.h>
#include <linux/init.h>
#include <linux/kallsyms.h>
@@ -87,7 +88,7 @@ struct mod_tree_root mod_tree __cacheline_aligned = {
struct symsearch {
const struct kernel_symbol *start, *stop;
const u32 *crcs;
- enum mod_license license;
+ const u8 *flagstab;
};
/*
@@ -364,19 +365,21 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms,
struct find_symbol_arg *fsa)
{
struct kernel_symbol *sym;
-
- if (!fsa->gplok && syms->license == GPL_ONLY)
- return false;
+ u8 sym_flags;
sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
sizeof(struct kernel_symbol), cmp_name);
if (!sym)
return false;
+ sym_flags = *(syms->flagstab + (sym - syms->start));
+ if (!fsa->gplok && (sym_flags & KSYM_FLAG_GPL_ONLY))
+ return false;
+
fsa->owner = owner;
fsa->crc = symversion(syms->crcs, sym - syms->start);
fsa->sym = sym;
- fsa->license = syms->license;
+ fsa->license = (sym_flags & KSYM_FLAG_GPL_ONLY) ? GPL_ONLY : NOT_GPL_ONLY;
return true;
}
@@ -387,36 +390,31 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms,
*/
bool find_symbol(struct find_symbol_arg *fsa)
{
- static const struct symsearch arr[] = {
- { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
- NOT_GPL_ONLY },
- { __start___ksymtab_gpl, __stop___ksymtab_gpl,
- __start___kcrctab_gpl,
- GPL_ONLY },
+ const struct symsearch syms = {
+ .start = __start___ksymtab,
+ .stop = __stop___ksymtab,
+ .crcs = __start___kcrctab,
+ .flagstab = __start___kflagstab,
};
struct module *mod;
- unsigned int i;
- for (i = 0; i < ARRAY_SIZE(arr); i++)
- if (find_exported_symbol_in_section(&arr[i], NULL, fsa))
- return true;
+ if (find_exported_symbol_in_section(&syms, NULL, fsa))
+ return true;
list_for_each_entry_rcu(mod, &modules, list,
lockdep_is_held(&module_mutex)) {
- struct symsearch arr[] = {
- { mod->syms, mod->syms + mod->num_syms, mod->crcs,
- NOT_GPL_ONLY },
- { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
- mod->gpl_crcs,
- GPL_ONLY },
+ const struct symsearch syms = {
+ .start = mod->syms,
+ .stop = mod->syms + mod->num_syms,
+ .crcs = mod->crcs,
+ .flagstab = mod->flagstab,
};
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- for (i = 0; i < ARRAY_SIZE(arr); i++)
- if (find_exported_symbol_in_section(&arr[i], mod, fsa))
- return true;
+ if (find_exported_symbol_in_section(&syms, mod, fsa))
+ return true;
}
pr_debug("Failed to find symbol %s\n", fsa->name);
@@ -607,6 +605,36 @@ static const struct module_attribute modinfo_##field = { \
MODINFO_ATTR(version);
MODINFO_ATTR(srcversion);
+static void setup_modinfo_import_ns(struct module *mod, const char *s)
+{
+ mod->imported_namespaces = NULL;
+}
+
+static ssize_t show_modinfo_import_ns(const struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ return sysfs_emit(buffer, "%s\n", mk->mod->imported_namespaces);
+}
+
+static int modinfo_import_ns_exists(struct module *mod)
+{
+ return mod->imported_namespaces != NULL;
+}
+
+static void free_modinfo_import_ns(struct module *mod)
+{
+ kfree(mod->imported_namespaces);
+ mod->imported_namespaces = NULL;
+}
+
+static const struct module_attribute modinfo_import_ns = {
+ .attr = { .name = "import_ns", .mode = 0444 },
+ .show = show_modinfo_import_ns,
+ .setup = setup_modinfo_import_ns,
+ .test = modinfo_import_ns_exists,
+ .free = free_modinfo_import_ns,
+};
+
static struct {
char name[MODULE_NAME_LEN];
char taints[MODULE_FLAGS_BUF_SIZE];
@@ -1058,6 +1086,7 @@ const struct module_attribute *const modinfo_attrs[] = {
&module_uevent,
&modinfo_version,
&modinfo_srcversion,
+ &modinfo_import_ns,
&modinfo_initstate,
&modinfo_coresize,
#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
@@ -1408,7 +1437,7 @@ static void free_module(struct module *mod)
module_unload_free(mod);
/* Free any allocated parameters. */
- destroy_params(mod->kp, mod->num_kp);
+ module_destroy_params(mod->kp, mod->num_kp);
if (is_livepatch_module(mod))
free_module_elf(mod);
@@ -1466,29 +1495,17 @@ EXPORT_SYMBOL_GPL(__symbol_get);
*/
static int verify_exported_symbols(struct module *mod)
{
- unsigned int i;
const struct kernel_symbol *s;
- struct {
- const struct kernel_symbol *sym;
- unsigned int num;
- } arr[] = {
- { mod->syms, mod->num_syms },
- { mod->gpl_syms, mod->num_gpl_syms },
- };
-
- for (i = 0; i < ARRAY_SIZE(arr); i++) {
- for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
- struct find_symbol_arg fsa = {
- .name = kernel_symbol_name(s),
- .gplok = true,
- };
- if (find_symbol(&fsa)) {
- pr_err("%s: exports duplicate symbol %s"
- " (owned by %s)\n",
- mod->name, kernel_symbol_name(s),
- module_name(fsa.owner));
- return -ENOEXEC;
- }
+ for (s = mod->syms; s < mod->syms + mod->num_syms; s++) {
+ struct find_symbol_arg fsa = {
+ .name = kernel_symbol_name(s),
+ .gplok = true,
+ };
+ if (find_symbol(&fsa)) {
+ pr_err("%s: exports duplicate symbol %s (owned by %s)\n",
+ mod->name, kernel_symbol_name(s),
+ module_name(fsa.owner));
+ return -ENOEXEC;
}
}
return 0;
@@ -1568,6 +1585,13 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
break;
default:
+ if (sym[i].st_shndx >= info->hdr->e_shnum) {
+ pr_err("%s: Symbol %s has an invalid section index %u (max %u)\n",
+ mod->name, name, sym[i].st_shndx, info->hdr->e_shnum - 1);
+ ret = -ENOEXEC;
+ break;
+ }
+
/* Divert to percpu allocation if a percpu var. */
if (sym[i].st_shndx == info->index.pcpu)
secbase = (unsigned long)mod_percpu(mod);
@@ -1753,11 +1777,43 @@ static void module_license_taint_check(struct module *mod, const char *license)
}
}
+static int copy_modinfo_import_ns(struct module *mod, struct load_info *info)
+{
+ char *ns;
+ size_t len, total_len = 0;
+ char *buf, *p;
+
+ for_each_modinfo_entry(ns, info, "import_ns")
+ total_len += strlen(ns) + 1;
+
+ if (!total_len) {
+ mod->imported_namespaces = NULL;
+ return 0;
+ }
+
+ buf = kmalloc(total_len, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ p = buf;
+ for_each_modinfo_entry(ns, info, "import_ns") {
+ len = strlen(ns);
+ memcpy(p, ns, len);
+ p += len;
+ *p++ = '\n';
+ }
+ /* Replace trailing newline with null terminator. */
+ *(p - 1) = '\0';
+
+ mod->imported_namespaces = buf;
+ return 0;
+}
+
static int setup_modinfo(struct module *mod, struct load_info *info)
{
const struct module_attribute *attr;
char *imported_namespace;
- int i;
+ int i, err;
for (i = 0; (attr = modinfo_attrs[i]); i++) {
if (attr->setup)
@@ -1776,6 +1832,10 @@ static int setup_modinfo(struct module *mod, struct load_info *info)
}
}
+ err = copy_modinfo_import_ns(mod, info);
+ if (err)
+ return err;
+
return 0;
}
@@ -2603,10 +2663,14 @@ static int find_module_sections(struct module *mod, struct load_info *info)
mod->syms = section_objs(info, "__ksymtab",
sizeof(*mod->syms), &mod->num_syms);
mod->crcs = section_addr(info, "__kcrctab");
- mod->gpl_syms = section_objs(info, "__ksymtab_gpl",
- sizeof(*mod->gpl_syms),
- &mod->num_gpl_syms);
- mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
+ mod->flagstab = section_addr(info, "__kflagstab");
+
+ if (section_addr(info, "__ksymtab_gpl"))
+ pr_warn("%s: ignoring obsolete section __ksymtab_gpl\n",
+ mod->name);
+ if (section_addr(info, "__kcrctab_gpl"))
+ pr_warn("%s: ignoring obsolete section __kcrctab_gpl\n",
+ mod->name);
#ifdef CONFIG_CONSTRUCTORS
mod->ctors = section_objs(info, ".ctors",
@@ -2810,11 +2874,14 @@ out_err:
return ret;
}
-static int check_export_symbol_versions(struct module *mod)
+static int check_export_symbol_sections(struct module *mod)
{
+ if (mod->num_syms && !mod->flagstab) {
+ pr_err("%s: no flags for exported symbols\n", mod->name);
+ return -ENOEXEC;
+ }
#ifdef CONFIG_MODVERSIONS
- if ((mod->num_syms && !mod->crcs) ||
- (mod->num_gpl_syms && !mod->gpl_crcs)) {
+ if (mod->num_syms && !mod->crcs) {
return try_to_force_load(mod,
"no versions for exported symbols");
}
@@ -3038,15 +3105,19 @@ static noinline int do_init_module(struct module *mod)
if (mod->init != NULL)
ret = do_one_initcall(mod->init);
if (ret < 0) {
+ /*
+ * -EEXIST is reserved by [f]init_module() to signal to userspace that
+ * a module with this name is already loaded. Use something else if the
+ * module itself is returning that.
+ */
+ if (ret == -EEXIST)
+ ret = -EBUSY;
+
goto fail_free_freeinit;
}
- if (ret > 0) {
- pr_warn("%s: '%s'->init suspiciously returned %d, it should "
- "follow 0/-E convention\n"
- "%s: loading module anyway...\n",
- __func__, mod->name, ret, __func__);
- dump_stack();
- }
+ if (ret > 0)
+ pr_warn("%s: init suspiciously returned %d, it should follow 0/-E convention\n",
+ mod->name, ret);
/* Now it's a first class citizen! */
mod->state = MODULE_STATE_LIVE;
@@ -3427,7 +3498,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
if (err)
goto free_unload;
- err = check_export_symbol_versions(mod);
+ err = check_export_symbol_sections(mod);
if (err)
goto free_unload;
@@ -3512,7 +3583,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
mod_sysfs_teardown(mod);
coming_cleanup:
mod->state = MODULE_STATE_GOING;
- destroy_params(mod->kp, mod->num_kp);
+ module_destroy_params(mod->kp, mod->num_kp);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
klp_module_going(mod);
@@ -3544,12 +3615,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
mutex_unlock(&module_mutex);
free_module:
mod_stat_bump_invalid(info, flags);
- /* Free lock-classes; relies on the preceding sync_rcu() */
- for_class_mod_mem_type(type, core_data) {
- lockdep_free_key_range(mod->mem[type].base,
- mod->mem[type].size);
- }
-
module_memory_restore_rox(mod);
module_deallocate(mod, info);
free_copy:
diff --git a/kernel/module/signing.c b/kernel/module/signing.c
index a2ff4242e623..590ba29c85ab 100644
--- a/kernel/module/signing.c
+++ b/kernel/module/signing.c
@@ -70,7 +70,7 @@ int mod_verify_sig(const void *mod, struct load_info *info)
int module_sig_check(struct load_info *info, int flags)
{
int err = -ENODATA;
- const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
+ const unsigned long markerlen = sizeof(MODULE_SIGNATURE_MARKER) - 1;
const char *reason;
const void *mod = info->hdr;
bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS |
@@ -81,7 +81,7 @@ int module_sig_check(struct load_info *info, int flags)
*/
if (!mangled_module &&
info->len > markerlen &&
- memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
+ memcmp(mod + info->len - markerlen, MODULE_SIGNATURE_MARKER, markerlen) == 0) {
/* We truncate the module to discard the signature */
info->len -= markerlen;
err = mod_verify_sig(mod, info);
diff --git a/kernel/module_signature.c b/kernel/module_signature.c
index 00132d12487c..a0eee2fe4368 100644
--- a/kernel/module_signature.c
+++ b/kernel/module_signature.c
@@ -24,7 +24,7 @@ int mod_check_sig(const struct module_signature *ms, size_t file_len,
if (be32_to_cpu(ms->sig_len) >= file_len - sizeof(*ms))
return -EBADMSG;
- if (ms->id_type != PKEY_ID_PKCS7) {
+ if (ms->id_type != MODULE_SIGNATURE_TYPE_PKCS7) {
pr_err("%s: not signed with expected PKCS#7 message\n",
name);
return -ENOPKG;
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
index bdc3c86231d3..3166c1fd844a 100644
--- a/kernel/nscommon.c
+++ b/kernel/nscommon.c
@@ -309,3 +309,9 @@ void __ns_ref_active_get(struct ns_common *ns)
return;
}
}
+
+bool may_see_all_namespaces(void)
+{
+ return (task_active_pid_ns(current) == &init_pid_ns) &&
+ ns_capable_noaudit(init_pid_ns.user_ns, CAP_SYS_ADMIN);
+}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 259c4b4f1eeb..d9d3d5973bf5 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -12,6 +12,7 @@
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
+#include <linux/ns/ns_common_types.h>
#include <linux/init_task.h>
#include <linux/mnt_namespace.h>
#include <linux/utsname.h>
@@ -95,7 +96,8 @@ static struct nsproxy *create_new_namespaces(u64 flags,
if (!new_nsp)
return ERR_PTR(-ENOMEM);
- new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
+ new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns,
+ user_ns, new_fs);
if (IS_ERR(new_nsp->mnt_ns)) {
err = PTR_ERR(new_nsp->mnt_ns);
goto out_ns;
@@ -170,9 +172,7 @@ int copy_namespaces(u64 flags, struct task_struct *tsk)
struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
struct nsproxy *new_ns;
- if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWPID | CLONE_NEWNET |
- CLONE_NEWCGROUP | CLONE_NEWTIME)))) {
+ if (likely(!(flags & (CLONE_NS_ALL & ~CLONE_NEWUSER)))) {
if ((flags & CLONE_VM) ||
likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
get_nsproxy(old_ns);
@@ -212,18 +212,26 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
{
struct user_namespace *user_ns;
+ u64 flags = unshare_flags;
int err = 0;
- if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
- CLONE_NEWTIME)))
+ if (!(flags & (CLONE_NS_ALL & ~CLONE_NEWUSER)))
return 0;
user_ns = new_cred ? new_cred->user_ns : current_user_ns();
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
- *new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
+ /*
+ * Convert the 32-bit UNSHARE_EMPTY_MNTNS (which aliases
+ * CLONE_PARENT_SETTID) to the unique 64-bit CLONE_EMPTY_MNTNS.
+ */
+ if (flags & UNSHARE_EMPTY_MNTNS) {
+ flags &= ~(u64)UNSHARE_EMPTY_MNTNS;
+ flags |= CLONE_EMPTY_MNTNS;
+ }
+
+ *new_nsp = create_new_namespaces(flags, current, user_ns,
new_fs ? new_fs : current->fs);
if (IS_ERR(*new_nsp)) {
err = PTR_ERR(*new_nsp);
@@ -292,9 +300,7 @@ int exec_task_namespaces(void)
static int check_setns_flags(unsigned long flags)
{
- if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWNET | CLONE_NEWTIME | CLONE_NEWUSER |
- CLONE_NEWPID | CLONE_NEWCGROUP)))
+ if (!flags || (flags & ~CLONE_NS_ALL))
return -EINVAL;
#ifndef CONFIG_USER_NS
diff --git a/kernel/nstree.c b/kernel/nstree.c
index f36c59e6951d..6d12e5900ac0 100644
--- a/kernel/nstree.c
+++ b/kernel/nstree.c
@@ -515,32 +515,11 @@ static inline bool __must_check ns_requested(const struct klistns *kls,
static inline bool __must_check may_list_ns(const struct klistns *kls,
struct ns_common *ns)
{
- if (kls->user_ns) {
- if (kls->userns_capable)
- return true;
- } else {
- struct ns_common *owner;
- struct user_namespace *user_ns;
-
- owner = ns_owner(ns);
- if (owner)
- user_ns = to_user_ns(owner);
- else
- user_ns = &init_user_ns;
- if (ns_capable_noaudit(user_ns, CAP_SYS_ADMIN))
- return true;
- }
-
- if (is_current_namespace(ns))
+ if (kls->user_ns && kls->userns_capable)
return true;
-
- if (ns->ns_type != CLONE_NEWUSER)
- return false;
-
- if (ns_capable_noaudit(to_user_ns(ns), CAP_SYS_ADMIN))
+ if (is_current_namespace(ns))
return true;
-
- return false;
+ return may_see_all_namespaces();
}
static inline void ns_put(struct ns_common *ns)
@@ -600,7 +579,7 @@ static ssize_t do_listns_userns(struct klistns *kls)
ret = 0;
head = &to_ns_common(kls->user_ns)->ns_owner_root.ns_list_head;
- kls->userns_capable = ns_capable_noaudit(kls->user_ns, CAP_SYS_ADMIN);
+ kls->userns_capable = may_see_all_namespaces();
rcu_read_lock();
diff --git a/kernel/padata.c b/kernel/padata.c
index 8657e6e0c224..0d3ea1b68b1f 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -535,7 +535,8 @@ static void padata_init_reorder_list(struct parallel_data *pd)
}
/* Allocate and initialize the internal cpumask dependend resources. */
-static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
+static struct parallel_data *padata_alloc_pd(struct padata_shell *ps,
+ int offlining_cpu)
{
struct padata_instance *pinst = ps->pinst;
struct parallel_data *pd;
@@ -561,6 +562,10 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
cpumask_and(pd->cpumask.pcpu, pinst->cpumask.pcpu, cpu_online_mask);
cpumask_and(pd->cpumask.cbcpu, pinst->cpumask.cbcpu, cpu_online_mask);
+ if (offlining_cpu >= 0) {
+ __cpumask_clear_cpu(offlining_cpu, pd->cpumask.pcpu);
+ __cpumask_clear_cpu(offlining_cpu, pd->cpumask.cbcpu);
+ }
padata_init_reorder_list(pd);
padata_init_squeues(pd);
@@ -607,11 +612,11 @@ static void __padata_stop(struct padata_instance *pinst)
}
/* Replace the internal control structure with a new one. */
-static int padata_replace_one(struct padata_shell *ps)
+static int padata_replace_one(struct padata_shell *ps, int offlining_cpu)
{
struct parallel_data *pd_new;
- pd_new = padata_alloc_pd(ps);
+ pd_new = padata_alloc_pd(ps, offlining_cpu);
if (!pd_new)
return -ENOMEM;
@@ -621,7 +626,7 @@ static int padata_replace_one(struct padata_shell *ps)
return 0;
}
-static int padata_replace(struct padata_instance *pinst)
+static int padata_replace(struct padata_instance *pinst, int offlining_cpu)
{
struct padata_shell *ps;
int err = 0;
@@ -629,7 +634,7 @@ static int padata_replace(struct padata_instance *pinst)
pinst->flags |= PADATA_RESET;
list_for_each_entry(ps, &pinst->pslist, list) {
- err = padata_replace_one(ps);
+ err = padata_replace_one(ps, offlining_cpu);
if (err)
break;
}
@@ -646,9 +651,21 @@ static int padata_replace(struct padata_instance *pinst)
/* If cpumask contains no active cpu, we mark the instance as invalid. */
static bool padata_validate_cpumask(struct padata_instance *pinst,
- const struct cpumask *cpumask)
+ const struct cpumask *cpumask,
+ int offlining_cpu)
{
- if (!cpumask_intersects(cpumask, cpu_online_mask)) {
+ cpumask_copy(pinst->validate_cpumask, cpu_online_mask);
+
+ /*
+ * @offlining_cpu is still in cpu_online_mask, so remove it here for
+ * validation. Using a sub-CPUHP_TEARDOWN_CPU hotplug state where
+ * @offlining_cpu wouldn't be in the online mask doesn't work because
+ * padata_cpu_offline() can fail but such a state doesn't allow failure.
+ */
+ if (offlining_cpu >= 0)
+ __cpumask_clear_cpu(offlining_cpu, pinst->validate_cpumask);
+
+ if (!cpumask_intersects(cpumask, pinst->validate_cpumask)) {
pinst->flags |= PADATA_INVALID;
return false;
}
@@ -664,13 +681,13 @@ static int __padata_set_cpumasks(struct padata_instance *pinst,
int valid;
int err;
- valid = padata_validate_cpumask(pinst, pcpumask);
+ valid = padata_validate_cpumask(pinst, pcpumask, -1);
if (!valid) {
__padata_stop(pinst);
goto out_replace;
}
- valid = padata_validate_cpumask(pinst, cbcpumask);
+ valid = padata_validate_cpumask(pinst, cbcpumask, -1);
if (!valid)
__padata_stop(pinst);
@@ -678,7 +695,7 @@ out_replace:
cpumask_copy(pinst->cpumask.pcpu, pcpumask);
cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
- err = padata_setup_cpumasks(pinst) ?: padata_replace(pinst);
+ err = padata_setup_cpumasks(pinst) ?: padata_replace(pinst, -1);
if (valid)
__padata_start(pinst);
@@ -730,36 +747,6 @@ EXPORT_SYMBOL(padata_set_cpumask);
#ifdef CONFIG_HOTPLUG_CPU
-static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
-{
- int err = 0;
-
- if (cpumask_test_cpu(cpu, cpu_online_mask)) {
- err = padata_replace(pinst);
-
- if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) &&
- padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
- __padata_start(pinst);
- }
-
- return err;
-}
-
-static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
-{
- int err = 0;
-
- if (!cpumask_test_cpu(cpu, cpu_online_mask)) {
- if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) ||
- !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
- __padata_stop(pinst);
-
- err = padata_replace(pinst);
- }
-
- return err;
-}
-
static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
{
return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) ||
@@ -771,27 +758,39 @@ static int padata_cpu_online(unsigned int cpu, struct hlist_node *node)
struct padata_instance *pinst;
int ret;
- pinst = hlist_entry_safe(node, struct padata_instance, cpu_online_node);
+ pinst = hlist_entry_safe(node, struct padata_instance, cpuhp_node);
if (!pinst_has_cpu(pinst, cpu))
return 0;
mutex_lock(&pinst->lock);
- ret = __padata_add_cpu(pinst, cpu);
+
+ ret = padata_replace(pinst, -1);
+
+ if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu, -1) &&
+ padata_validate_cpumask(pinst, pinst->cpumask.cbcpu, -1))
+ __padata_start(pinst);
+
mutex_unlock(&pinst->lock);
return ret;
}
-static int padata_cpu_dead(unsigned int cpu, struct hlist_node *node)
+static int padata_cpu_offline(unsigned int cpu, struct hlist_node *node)
{
struct padata_instance *pinst;
int ret;
- pinst = hlist_entry_safe(node, struct padata_instance, cpu_dead_node);
+ pinst = hlist_entry_safe(node, struct padata_instance, cpuhp_node);
if (!pinst_has_cpu(pinst, cpu))
return 0;
mutex_lock(&pinst->lock);
- ret = __padata_remove_cpu(pinst, cpu);
+
+ if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu, cpu) ||
+ !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu, cpu))
+ __padata_stop(pinst);
+
+ ret = padata_replace(pinst, cpu);
+
mutex_unlock(&pinst->lock);
return ret;
}
@@ -802,15 +801,14 @@ static enum cpuhp_state hp_online;
static void __padata_free(struct padata_instance *pinst)
{
#ifdef CONFIG_HOTPLUG_CPU
- cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD,
- &pinst->cpu_dead_node);
- cpuhp_state_remove_instance_nocalls(hp_online, &pinst->cpu_online_node);
+ cpuhp_state_remove_instance_nocalls(hp_online, &pinst->cpuhp_node);
#endif
WARN_ON(!list_empty(&pinst->pslist));
free_cpumask_var(pinst->cpumask.pcpu);
free_cpumask_var(pinst->cpumask.cbcpu);
+ free_cpumask_var(pinst->validate_cpumask);
destroy_workqueue(pinst->serial_wq);
destroy_workqueue(pinst->parallel_wq);
kfree(pinst);
@@ -971,10 +969,10 @@ struct padata_instance *padata_alloc(const char *name)
if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL))
goto err_free_serial_wq;
- if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) {
- free_cpumask_var(pinst->cpumask.pcpu);
- goto err_free_serial_wq;
- }
+ if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL))
+ goto err_free_p_mask;
+ if (!alloc_cpumask_var(&pinst->validate_cpumask, GFP_KERNEL))
+ goto err_free_cb_mask;
INIT_LIST_HEAD(&pinst->pslist);
@@ -982,7 +980,7 @@ struct padata_instance *padata_alloc(const char *name)
cpumask_copy(pinst->cpumask.cbcpu, cpu_possible_mask);
if (padata_setup_cpumasks(pinst))
- goto err_free_masks;
+ goto err_free_v_mask;
__padata_start(pinst);
@@ -991,18 +989,19 @@ struct padata_instance *padata_alloc(const char *name)
#ifdef CONFIG_HOTPLUG_CPU
cpuhp_state_add_instance_nocalls_cpuslocked(hp_online,
- &pinst->cpu_online_node);
- cpuhp_state_add_instance_nocalls_cpuslocked(CPUHP_PADATA_DEAD,
- &pinst->cpu_dead_node);
+ &pinst->cpuhp_node);
#endif
cpus_read_unlock();
return pinst;
-err_free_masks:
- free_cpumask_var(pinst->cpumask.pcpu);
+err_free_v_mask:
+ free_cpumask_var(pinst->validate_cpumask);
+err_free_cb_mask:
free_cpumask_var(pinst->cpumask.cbcpu);
+err_free_p_mask:
+ free_cpumask_var(pinst->cpumask.pcpu);
err_free_serial_wq:
destroy_workqueue(pinst->serial_wq);
err_put_cpus:
@@ -1045,7 +1044,7 @@ struct padata_shell *padata_alloc_shell(struct padata_instance *pinst)
ps->pinst = pinst;
cpus_read_lock();
- pd = padata_alloc_pd(ps);
+ pd = padata_alloc_pd(ps, -1);
cpus_read_unlock();
if (!pd)
@@ -1094,31 +1093,24 @@ void __init padata_init(void)
int ret;
ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online",
- padata_cpu_online, NULL);
+ padata_cpu_online, padata_cpu_offline);
if (ret < 0)
goto err;
hp_online = ret;
-
- ret = cpuhp_setup_state_multi(CPUHP_PADATA_DEAD, "padata:dead",
- NULL, padata_cpu_dead);
- if (ret < 0)
- goto remove_online_state;
#endif
possible_cpus = num_possible_cpus();
padata_works = kmalloc_objs(struct padata_work, possible_cpus);
if (!padata_works)
- goto remove_dead_state;
+ goto remove_online_state;
for (i = 0; i < possible_cpus; ++i)
list_add(&padata_works[i].pw_list, &padata_free_works);
return;
-remove_dead_state:
-#ifdef CONFIG_HOTPLUG_CPU
- cpuhp_remove_multi_state(CPUHP_PADATA_DEAD);
remove_online_state:
+#ifdef CONFIG_HOTPLUG_CPU
cpuhp_remove_multi_state(hp_online);
err:
#endif
diff --git a/kernel/panic.c b/kernel/panic.c
index c78600212b6c..20feada5319d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -801,6 +801,8 @@ EXPORT_SYMBOL(panic);
* Documentation/admin-guide/tainted-kernels.rst, including its
* small shell script that prints the TAINT_FLAGS_COUNT bits of
* /proc/sys/kernel/tainted.
+ *
+ * Also, update INIT_TAINT_BUF_MAX below.
*/
const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G'),
@@ -854,15 +856,54 @@ static void print_tainted_seq(struct seq_buf *s, bool verbose)
}
}
+/* The initial buffer can accommodate all taint flags in verbose
+ * mode, with some headroom. Once the allocator is available, the
+ * exact size is allocated dynamically; the initial buffer remains
+ * as a fallback if allocation fails.
+ *
+ * The verbose taint string currently requires up to 327 characters.
+ */
+#define INIT_TAINT_BUF_MAX 350
+
+static char init_taint_buf[INIT_TAINT_BUF_MAX] __initdata;
+static char *taint_buf __refdata = init_taint_buf;
+static size_t taint_buf_size = INIT_TAINT_BUF_MAX;
+
+static __init int alloc_taint_buf(void)
+{
+ int i;
+ char *buf;
+ size_t size = 0;
+
+ size += sizeof("Tainted: ") - 1;
+ for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
+ size += 2; /* For ", " */
+ size += 4; /* For "[%c]=" */
+ size += strlen(taint_flags[i].desc);
+ }
+
+ size += 1; /* For NULL terminator */
+
+ buf = kmalloc(size, GFP_KERNEL);
+
+ if (!buf) {
+ panic("Failed to allocate taint string buffer");
+ }
+
+ taint_buf = buf;
+ taint_buf_size = size;
+
+ return 0;
+}
+postcore_initcall(alloc_taint_buf);
+
static const char *_print_tainted(bool verbose)
{
- /* FIXME: what should the size be? */
- static char buf[sizeof(taint_flags)];
struct seq_buf s;
BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT);
- seq_buf_init(&s, buf, sizeof(buf));
+ seq_buf_init(&s, taint_buf, taint_buf_size);
print_tainted_seq(&s, verbose);
diff --git a/kernel/params.c b/kernel/params.c
index 7188a12dbe86..74d620bc2521 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -161,7 +161,7 @@ static int parse_one(char *param,
char *parse_args(const char *doing,
char *args,
const struct kernel_param *params,
- unsigned num,
+ unsigned int num,
s16 min_level,
s16 max_level,
void *arg, parse_unknown_fn unknown)
@@ -745,15 +745,6 @@ void module_param_sysfs_remove(struct module *mod)
}
#endif
-void destroy_params(const struct kernel_param *params, unsigned num)
-{
- unsigned int i;
-
- for (i = 0; i < num; i++)
- if (params[i].ops->free)
- params[i].ops->free(params[i].arg);
-}
-
struct module_kobject * __init_or_module
lookup_or_create_module_kobject(const char *name)
{
@@ -985,3 +976,21 @@ static int __init param_sysfs_builtin_init(void)
late_initcall(param_sysfs_builtin_init);
#endif /* CONFIG_SYSFS */
+
+#ifdef CONFIG_MODULES
+
+/*
+ * module_destroy_params - free all parameters for one module
+ * @params: module parameters (array)
+ * @num: number of module parameters
+ */
+void module_destroy_params(const struct kernel_param *params, unsigned int num)
+{
+ unsigned int i;
+
+ for (i = 0; i < num; i++)
+ if (params[i].ops->free)
+ params[i].ops->free(params[i].arg);
+}
+
+#endif /* CONFIG_MODULES */
diff --git a/kernel/pid.c b/kernel/pid.c
index 3b96571d0fe6..fd5c2d4aa349 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -128,12 +128,11 @@ void free_pid(struct pid *pid)
* is the reaper wake up the reaper. The reaper
* may be sleeping in zap_pid_ns_processes().
*/
- wake_up_process(ns->child_reaper);
+ wake_up_process(READ_ONCE(ns->child_reaper));
break;
case PIDNS_ADDING:
- /* Handle a fork failure of the first process */
- WARN_ON(ns->child_reaper);
- ns->pid_allocated = 0;
+ /* Only possible if the 1st fork fails */
+ WARN_ON(READ_ONCE(ns->child_reaper));
break;
}
@@ -215,12 +214,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
retval = -EINVAL;
if (tid < 1 || tid >= pid_max[ns->level - i])
goto out_abort;
- /*
- * Also fail if a PID != 1 is requested and
- * no PID 1 exists.
- */
- if (tid != 1 && !tmp->child_reaper)
- goto out_abort;
retval = -EPERM;
if (!checkpoint_restore_ns_capable(tmp->user_ns))
goto out_abort;
@@ -236,6 +229,10 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
retried_preload = false;
idr_preload(GFP_KERNEL);
spin_lock(&pidmap_lock);
+ /* For the case when the previous attempt to create init failed */
+ if (ns->pid_allocated == PIDNS_ADDING)
+ idr_set_cursor(&ns->idr, 0);
+
for (tmp = ns, i = ns->level; i >= 0;) {
int tid = set_tid[ns->level - i];
@@ -296,9 +293,18 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
pid->numbers[i].nr = nr;
pid->numbers[i].ns = tmp;
- tmp = tmp->parent;
i--;
retried_preload = false;
+
+ /*
+ * PID 1 (init) must be created first.
+ */
+ if (!READ_ONCE(tmp->child_reaper) && nr != 1) {
+ retval = -EINVAL;
+ goto out_free;
+ }
+
+ tmp = tmp->parent;
}
/*
@@ -311,6 +317,11 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
*
* This can't be done earlier because we need to preserve other
* error conditions.
+ *
+ * We need this even if copy_process() does the same check. If two
+ * or more tasks from parent namespace try to inject a child into a
+ * dead namespace, one of free_pid() calls from the copy_process()
+ * error path may try to wakeup the possibly freed ns->child_reaper.
*/
retval = -ENOMEM;
if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
@@ -338,10 +349,6 @@ out_free:
idr_remove(&upid->ns->idr, upid->nr);
}
- /* On failure to allocate the first pid, reset the state */
- if (ns->pid_allocated == PIDNS_ADDING)
- idr_set_cursor(&ns->idr, 0);
-
spin_unlock(&pidmap_lock);
idr_preload_end();
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index e48f5de41361..d36afc58ee1d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -369,15 +369,6 @@ static struct ns_common *pidns_for_children_get(struct task_struct *task)
}
task_unlock(task);
- if (ns) {
- read_lock(&tasklist_lock);
- if (!ns->child_reaper) {
- put_pid_ns(ns);
- ns = NULL;
- }
- read_unlock(&tasklist_lock);
- }
-
return ns ? &ns->ns : NULL;
}
diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c
index 5a611d3950fd..4d4fd29bd2be 100644
--- a/kernel/power/em_netlink.c
+++ b/kernel/power/em_netlink.c
@@ -109,6 +109,8 @@ int dev_energymodel_nl_get_perf_domains_doit(struct sk_buff *skb,
id = nla_get_u32(info->attrs[DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID]);
pd = em_perf_domain_get_by_id(id);
+ if (!pd)
+ return -EINVAL;
__em_nl_get_pd_size(pd, &msg_sz);
msg = genlmsg_new(msg_sz, GFP_KERNEL);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 5f8c9e12eaec..5429e9f19b65 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -40,7 +40,7 @@ void pm_restore_gfp_mask(void)
{
WARN_ON(!mutex_is_locked(&system_transition_mutex));
- if (WARN_ON(!saved_gfp_count) || --saved_gfp_count)
+ if (!saved_gfp_count || --saved_gfp_count)
return;
gfp_allowed_mask = saved_gfp_mask;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 6e1321837c66..a564650734dc 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -2855,6 +2855,17 @@ int snapshot_write_finalize(struct snapshot_handle *handle)
{
int error;
+ /*
+ * Call snapshot_write_next() to drain any trailing zero pages,
+ * but make sure we're in the data page region first.
+ * This function can return PAGE_SIZE if the kernel was expecting
+ * another copy page. Return -ENODATA in that situation.
+ */
+ if (handle->cur > nr_meta_pages + 1) {
+ error = snapshot_write_next(handle);
+ if (error)
+ return error > 0 ? -ENODATA : error;
+ }
copy_last_highmem_page();
error = hibernate_restore_protect_page(handle->buffer);
/* Do that only if we have loaded the image entirely */
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 4401cfe26e5c..be77f3556bd7 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -322,11 +322,14 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
error = snapshot_write_finalize(&data->handle);
if (error)
break;
- if (data->mode != O_WRONLY || !data->frozen ||
- !snapshot_image_loaded(&data->handle)) {
+ if (data->mode != O_WRONLY || !data->frozen) {
error = -EPERM;
break;
}
+ if (!snapshot_image_loaded(&data->handle)) {
+ error = -ENODATA;
+ break;
+ }
error = hibernation_restore(data->platform_support);
break;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 392ec2f75f01..68c17daef8d4 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -549,7 +549,8 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
if (!dead && thread_group_empty(p)) {
if (!same_thread_group(p->real_parent, tracer))
dead = do_notify_parent(p, p->exit_signal);
- else if (ignoring_children(tracer->sighand)) {
+ else if (ignoring_children(tracer->sighand) ||
+ p->signal->autoreap) {
__wake_up_parent(p, tracer);
dead = true;
}
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 625d75392647..e078e988773d 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -228,4 +228,15 @@ config RCU_DYNTICKS_TORTURE
This has no value for production and is only for testing.
+config TRIVIAL_PREEMPT_RCU
+ bool "Textbook trivial preemptible RCU in rcutorture"
+ depends on RCU_EXPERT && RCU_TORTURE_TEST
+ default n
+ help
+ This option enables a textbook preemptible RCU that is
+ implemented in rcutorture. Its sole purpose is to validate
+ code used in books, papers, and presentations.
+
+ This has no value for production and is only for testing.
+
endmenu # "RCU Debugging"
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index dc5d614b372c..fa6d30ce73d1 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -502,6 +502,15 @@ do { \
___locked; \
})
+#define raw_spin_trylock_irqsave_rcu_node(p, flags) \
+({ \
+ bool ___locked = raw_spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ \
+ if (___locked) \
+ smp_mb__after_unlock_lock(); \
+ ___locked; \
+})
+
#define raw_lockdep_assert_held_rcu_node(p) \
lockdep_assert_held(&ACCESS_PRIVATE(p, lock))
@@ -682,4 +691,8 @@ int rcu_stall_notifier_call_chain(unsigned long val, void *v);
static inline int rcu_stall_notifier_call_chain(unsigned long val, void *v) { return NOTIFY_DONE; }
#endif // #else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
+#ifdef CONFIG_TRIVIAL_PREEMPT_RCU
+void synchronize_rcu_trivial_preempt(void);
+#endif // #ifdef CONFIG_TRIVIAL_PREEMPT_RCU
+
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 4ac2b134a983..ac0b1c6b7dae 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -79,12 +79,6 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
* test-end checks, and the pair of calls through pointers.
*/
-#ifdef MODULE
-# define RCUSCALE_SHUTDOWN 0
-#else
-# define RCUSCALE_SHUTDOWN 1
-#endif
-
torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");
torture_param(int, gp_async_max, 1000, "Max # outstanding waits per writer");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
@@ -92,8 +86,8 @@ torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
torture_param(int, minruntime, 0, "Minimum run time (s)");
torture_param(int, nreaders, -1, "Number of RCU reader threads");
torture_param(int, nwriters, -1, "Number of RCU updater threads");
-torture_param(bool, shutdown, RCUSCALE_SHUTDOWN,
- "Shutdown at end of scalability tests.");
+torture_param(int, shutdown_secs, !IS_MODULE(CONFIG_RCU_SCALE_TEST) * 300,
+ "Shutdown at end of scalability tests or at specified timeout (s).");
torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
torture_param(int, writer_holdoff_jiffies, 0, "Holdoff (jiffies) between GPs, zero to disable");
@@ -123,7 +117,6 @@ static int nrealreaders;
static int nrealwriters;
static struct task_struct **writer_tasks;
static struct task_struct **reader_tasks;
-static struct task_struct *shutdown_task;
static u64 **writer_durations;
static bool *writer_done;
@@ -132,7 +125,6 @@ static int *writer_n_durations;
static atomic_t n_rcu_scale_reader_started;
static atomic_t n_rcu_scale_writer_started;
static atomic_t n_rcu_scale_writer_finished;
-static wait_queue_head_t shutdown_wq;
static u64 t_rcu_scale_writer_started;
static u64 t_rcu_scale_writer_finished;
static unsigned long b_rcu_gp_test_started;
@@ -519,6 +511,8 @@ static void rcu_scale_async_cb(struct rcu_head *rhp)
rcu_scale_free(wmbp);
}
+static void rcu_scale_cleanup(void);
+
/*
* RCU scale writer kthread. Repeatedly does a grace period.
*/
@@ -622,9 +616,11 @@ rcu_scale_writer(void *arg)
b_rcu_gp_test_finished =
cur_ops->get_gp_seq();
}
- if (shutdown) {
+ if (shutdown_secs) {
+ writer_tasks[me] = NULL;
smp_mb(); /* Assign before wake. */
- wake_up(&shutdown_wq);
+ rcu_scale_cleanup();
+ kernel_power_off();
}
}
}
@@ -668,8 +664,8 @@ static void
rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag)
{
pr_alert("%s" SCALE_FLAG
- "--- %s: gp_async=%d gp_async_max=%d gp_exp=%d holdoff=%d minruntime=%d nreaders=%d nwriters=%d writer_holdoff=%d writer_holdoff_jiffies=%d verbose=%d shutdown=%d\n",
- scale_type, tag, gp_async, gp_async_max, gp_exp, holdoff, minruntime, nrealreaders, nrealwriters, writer_holdoff, writer_holdoff_jiffies, verbose, shutdown);
+ "--- %s: gp_async=%d gp_async_max=%d gp_exp=%d holdoff=%d minruntime=%d nreaders=%d nwriters=%d writer_holdoff=%d writer_holdoff_jiffies=%d verbose=%d shutdown_secs=%d\n",
+ scale_type, tag, gp_async, gp_async_max, gp_exp, holdoff, minruntime, nrealreaders, nrealwriters, writer_holdoff, writer_holdoff_jiffies, verbose, shutdown_secs);
}
/*
@@ -722,6 +718,8 @@ static void kfree_call_rcu(struct rcu_head *rh)
kfree(obj);
}
+static void kfree_scale_cleanup(void);
+
static int
kfree_scale_thread(void *arg)
{
@@ -791,9 +789,11 @@ kfree_scale_thread(void *arg)
rcuscale_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started),
PAGES_TO_MB(mem_begin - mem_during));
- if (shutdown) {
+ if (shutdown_secs) {
+ kfree_reader_tasks[me] = NULL;
smp_mb(); /* Assign before wake. */
- wake_up(&shutdown_wq);
+ kfree_scale_cleanup();
+ kernel_power_off();
}
}
@@ -820,22 +820,6 @@ kfree_scale_cleanup(void)
torture_cleanup_end();
}
-/*
- * shutdown kthread. Just waits to be awakened, then shuts down system.
- */
-static int
-kfree_scale_shutdown(void *arg)
-{
- wait_event_idle(shutdown_wq,
- atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads);
-
- smp_mb(); /* Wake before output. */
-
- kfree_scale_cleanup();
- kernel_power_off();
- return -EINVAL;
-}
-
// Used if doing RCU-kfree'ing via call_rcu().
static unsigned long jiffies_at_lazy_cb;
static struct rcu_head lazy_test1_rh;
@@ -895,13 +879,10 @@ kfree_scale_init(void)
kfree_nrealthreads = compute_real(kfree_nthreads);
/* Start up the kthreads. */
- if (shutdown) {
- init_waitqueue_head(&shutdown_wq);
- firsterr = torture_create_kthread(kfree_scale_shutdown, NULL,
- shutdown_task);
+ if (shutdown_secs) {
+ firsterr = torture_shutdown_init(shutdown_secs, kfree_scale_cleanup);
if (torture_init_error(firsterr))
goto unwind;
- schedule_timeout_uninterruptible(1);
}
pr_alert("kfree object size=%zu, kfree_by_call_rcu=%d\n",
@@ -1058,20 +1039,6 @@ rcu_scale_cleanup(void)
torture_cleanup_end();
}
-/*
- * RCU scalability shutdown kthread. Just waits to be awakened, then shuts
- * down system.
- */
-static int
-rcu_scale_shutdown(void *arg)
-{
- wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters);
- smp_mb(); /* Wake before output. */
- rcu_scale_cleanup();
- kernel_power_off();
- return -EINVAL;
-}
-
static int __init
rcu_scale_init(void)
{
@@ -1121,13 +1088,10 @@ rcu_scale_init(void)
/* Start up the kthreads. */
- if (shutdown) {
- init_waitqueue_head(&shutdown_wq);
- firsterr = torture_create_kthread(rcu_scale_shutdown, NULL,
- shutdown_task);
+ if (shutdown_secs) {
+ firsterr = torture_shutdown_init(shutdown_secs, rcu_scale_cleanup);
if (torture_init_error(firsterr))
goto unwind;
- schedule_timeout_uninterruptible(1);
}
reader_tasks = kzalloc_objs(reader_tasks[0], nrealreaders);
if (reader_tasks == NULL) {
@@ -1201,7 +1165,7 @@ rcu_scale_init(void)
unwind:
torture_init_end();
rcu_scale_cleanup();
- if (shutdown) {
+ if (shutdown_secs) {
WARN_ON(!IS_MODULE(CONFIG_RCU_SCALE_TEST));
kernel_power_off();
}
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 8a9282a0245c..5f2848b828dc 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -842,7 +842,14 @@ static unsigned long srcu_torture_completed(void)
static void srcu_torture_deferred_free(struct rcu_torture *rp)
{
+ unsigned long flags;
+ bool lockit = jiffies & 0x1;
+
+ if (lockit)
+ raw_spin_lock_irqsave(&current->pi_lock, flags);
call_srcu(srcu_ctlp, &rp->rtort_rcu, rcu_torture_cb);
+ if (lockit)
+ raw_spin_unlock_irqrestore(&current->pi_lock, flags);
}
static void srcu_torture_synchronize(void)
@@ -1061,6 +1068,61 @@ static struct rcu_torture_ops trivial_ops = {
.name = "trivial"
};
+#ifdef CONFIG_TRIVIAL_PREEMPT_RCU
+
+/*
+ * Definitions for trivial CONFIG_PREEMPT=y torture testing. This
+ * implementation does not work well with large numbers of tasks or with
+ * long-term preemption. Either or both get you RCU CPU stall warnings.
+ */
+
+static void rcu_sync_torture_init_trivial_preempt(void)
+{
+ rcu_sync_torture_init();
+ if (WARN_ONCE(onoff_interval || shuffle_interval, "%s: Non-zero onoff_interval (%d) or shuffle_interval (%d) breaks trivial RCU, resetting to zero", __func__, onoff_interval, shuffle_interval)) {
+ onoff_interval = 0;
+ shuffle_interval = 0;
+ }
+}
+
+static int rcu_torture_read_lock_trivial_preempt(void)
+{
+ struct task_struct *t = current;
+
+ WRITE_ONCE(t->rcu_trivial_preempt_nesting, t->rcu_trivial_preempt_nesting + 1);
+ smp_mb();
+ return 0;
+}
+
+static void rcu_torture_read_unlock_trivial_preempt(int idx)
+{
+ struct task_struct *t = current;
+
+ smp_store_release(&t->rcu_trivial_preempt_nesting, t->rcu_trivial_preempt_nesting - 1);
+}
+
+static struct rcu_torture_ops trivial_preempt_ops = {
+ .ttype = RCU_TRIVIAL_FLAVOR,
+ .init = rcu_sync_torture_init_trivial_preempt,
+ .readlock = rcu_torture_read_lock_trivial_preempt,
+ .read_delay = rcu_read_delay, // just reuse rcu's version.
+ .readunlock = rcu_torture_read_unlock_trivial_preempt,
+ .readlock_held = torture_readlock_not_held,
+ .get_gp_seq = rcu_no_completed,
+ .sync = synchronize_rcu_trivial_preempt,
+ .exp_sync = synchronize_rcu_trivial_preempt,
+ .irq_capable = 0, // In theory it should be, but let's keep it trivial.
+ .name = "trivial-preempt"
+};
+
+#define TRIVIAL_PREEMPT_OPS &trivial_preempt_ops,
+
+#else // #ifdef CONFIG_TRIVIAL_PREEMPT_RCU
+
+#define TRIVIAL_PREEMPT_OPS
+
+#endif // #else // #ifdef CONFIG_TRIVIAL_PREEMPT_RCU
+
#ifdef CONFIG_TASKS_RCU
/*
@@ -4449,7 +4511,7 @@ rcu_torture_init(void)
static struct rcu_torture_ops *torture_ops[] = {
&rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops,
TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS
- &trivial_ops,
+ &trivial_ops, TRIVIAL_PREEMPT_OPS
};
if (!torture_init_begin(torture_type, verbose))
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index c158b6a947cd..a2d9d75d88a1 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -92,15 +92,9 @@ torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs.");
torture_param(int, nruns, 30, "Number of experiments to run.");
// Reader delay in nanoseconds, 0 for no delay.
torture_param(int, readdelay, 0, "Read-side delay in nanoseconds.");
-
-#ifdef MODULE
-# define REFSCALE_SHUTDOWN 0
-#else
-# define REFSCALE_SHUTDOWN 1
-#endif
-
-torture_param(bool, shutdown, REFSCALE_SHUTDOWN,
- "Shutdown at end of scalability tests.");
+// Maximum shutdown delay in seconds, or zero for no shutdown.
torture_param(int, shutdown_secs, !IS_MODULE(CONFIG_RCU_REF_SCALE_TEST) * 300,
+ "Shutdown at end of scalability tests or at specified timeout (s).");
struct reader_task {
struct task_struct *task;
@@ -109,12 +103,8 @@ struct reader_task {
u64 last_duration_ns;
};
-static struct task_struct *shutdown_task;
-static wait_queue_head_t shutdown_wq;
-
static struct task_struct *main_task;
static wait_queue_head_t main_wq;
-static int shutdown_start;
static struct reader_task *reader_tasks;
@@ -1357,6 +1347,8 @@ static u64 process_durations(int n)
return sum;
}
+static void ref_scale_cleanup(void);
+
// The main_func is the main orchestrator, it performs a bunch of
// experiments. For every experiment, it orders all the readers
// involved to start and waits for them to finish the experiment. It
@@ -1443,9 +1435,10 @@ static int main_func(void *arg)
oom_exit:
// This will shutdown everything including us.
- if (shutdown) {
- shutdown_start = 1;
- wake_up(&shutdown_wq);
+ if (shutdown_secs) {
+ main_task = NULL; // Avoid self-kill deadlock.
+ ref_scale_cleanup();
+ kernel_power_off();
}
// Wait for torture to stop us
@@ -1463,8 +1456,8 @@ static void
ref_scale_print_module_parms(const struct ref_scale_ops *cur_ops, const char *tag)
{
pr_alert("%s" SCALE_FLAG
- "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%d nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
- verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay);
+ "--- %s: verbose=%d verbose_batched=%d shutdown_secs=%d holdoff=%d lookup_instances=%ld loops=%d nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
+ verbose, verbose_batched, shutdown_secs, holdoff, lookup_instances, loops, nreaders, nruns, readdelay);
}
static void
@@ -1497,19 +1490,6 @@ ref_scale_cleanup(void)
torture_cleanup_end();
}
-// Shutdown kthread. Just waits to be awakened, then shuts down system.
-static int
-ref_scale_shutdown(void *arg)
-{
- wait_event_idle(shutdown_wq, shutdown_start);
-
- smp_mb(); // Wake before output.
- ref_scale_cleanup();
- kernel_power_off();
-
- return -EINVAL;
-}
-
static int __init
ref_scale_init(void)
{
@@ -1553,13 +1533,10 @@ ref_scale_init(void)
ref_scale_print_module_parms(cur_ops, "Start of test");
// Shutdown task
- if (shutdown) {
- init_waitqueue_head(&shutdown_wq);
- firsterr = torture_create_kthread(ref_scale_shutdown, NULL,
- shutdown_task);
+ if (shutdown_secs) {
+ firsterr = torture_shutdown_init(shutdown_secs, ref_scale_cleanup);
if (torture_init_error(firsterr))
goto unwind;
- schedule_timeout_uninterruptible(1);
}
// Reader tasks (default to ~75% of online CPUs).
@@ -1604,7 +1581,7 @@ ref_scale_init(void)
unwind:
torture_init_end();
ref_scale_cleanup();
- if (shutdown) {
+ if (shutdown_secs) {
WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST));
kernel_power_off();
}
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 3450c3751ef7..a2e2d516e51b 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -9,6 +9,7 @@
*/
#include <linux/export.h>
+#include <linux/irq_work.h>
#include <linux/mutex.h>
#include <linux/preempt.h>
#include <linux/rcupdate_wait.h>
@@ -41,6 +42,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp)
ssp->srcu_idx_max = 0;
INIT_WORK(&ssp->srcu_work, srcu_drive_gp);
INIT_LIST_HEAD(&ssp->srcu_work.entry);
+ init_irq_work(&ssp->srcu_irq_work, srcu_tiny_irq_work);
return 0;
}
@@ -84,6 +86,7 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
void cleanup_srcu_struct(struct srcu_struct *ssp)
{
WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]);
+ irq_work_sync(&ssp->srcu_irq_work);
flush_work(&ssp->srcu_work);
WARN_ON(ssp->srcu_gp_running);
WARN_ON(ssp->srcu_gp_waiting);
@@ -177,6 +180,20 @@ void srcu_drive_gp(struct work_struct *wp)
}
EXPORT_SYMBOL_GPL(srcu_drive_gp);
+/*
+ * Use an irq_work to defer schedule_work() to avoid acquiring the workqueue
+ * pool->lock while the caller might hold scheduler locks, causing lockdep
+ * splats due to workqueue_init() doing a wakeup.
+ */
+void srcu_tiny_irq_work(struct irq_work *irq_work)
+{
+ struct srcu_struct *ssp;
+
+ ssp = container_of(irq_work, struct srcu_struct, srcu_irq_work);
+ schedule_work(&ssp->srcu_work);
+}
+EXPORT_SYMBOL_GPL(srcu_tiny_irq_work);
+
static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
{
unsigned long cookie;
@@ -189,7 +206,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
WRITE_ONCE(ssp->srcu_idx_max, cookie);
if (!READ_ONCE(ssp->srcu_gp_running)) {
if (likely(srcu_init_done))
- schedule_work(&ssp->srcu_work);
+ irq_work_queue(&ssp->srcu_irq_work);
else if (list_empty(&ssp->srcu_work.entry))
list_add(&ssp->srcu_work.entry, &srcu_boot_list);
}
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index aef8e91ad33e..0d01cd8c4b4a 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -19,6 +19,7 @@
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
+#include <linux/irq_work.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched.h>
#include <linux/smp.h>
@@ -75,44 +76,9 @@ static bool __read_mostly srcu_init_done;
static void srcu_invoke_callbacks(struct work_struct *work);
static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay);
static void process_srcu(struct work_struct *work);
+static void srcu_irq_work(struct irq_work *work);
static void srcu_delay_timer(struct timer_list *t);
-/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
-#define spin_lock_rcu_node(p) \
-do { \
- spin_lock(&ACCESS_PRIVATE(p, lock)); \
- smp_mb__after_unlock_lock(); \
-} while (0)
-
-#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock))
-
-#define spin_lock_irq_rcu_node(p) \
-do { \
- spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
- smp_mb__after_unlock_lock(); \
-} while (0)
-
-#define spin_unlock_irq_rcu_node(p) \
- spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
-
-#define spin_lock_irqsave_rcu_node(p, flags) \
-do { \
- spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
- smp_mb__after_unlock_lock(); \
-} while (0)
-
-#define spin_trylock_irqsave_rcu_node(p, flags) \
-({ \
- bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
- \
- if (___locked) \
- smp_mb__after_unlock_lock(); \
- ___locked; \
-})
-
-#define spin_unlock_irqrestore_rcu_node(p, flags) \
- spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
-
/*
* Initialize SRCU per-CPU data. Note that statically allocated
* srcu_struct structures might already have srcu_read_lock() and
@@ -131,7 +97,7 @@ static void init_srcu_struct_data(struct srcu_struct *ssp)
*/
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
+ raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
rcu_segcblist_init(&sdp->srcu_cblist);
sdp->srcu_cblist_invoking = false;
sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq;
@@ -186,7 +152,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
/* Each pass through this loop initializes one srcu_node structure. */
srcu_for_each_node_breadth_first(ssp, snp) {
- spin_lock_init(&ACCESS_PRIVATE(snp, lock));
+ raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock));
BUILD_BUG_ON(ARRAY_SIZE(snp->srcu_have_cbs) !=
ARRAY_SIZE(snp->srcu_data_have_cbs));
for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
@@ -242,7 +208,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
if (!ssp->srcu_sup)
return -ENOMEM;
if (!is_static)
- spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ raw_spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL;
ssp->srcu_sup->node = NULL;
mutex_init(&ssp->srcu_sup->srcu_cb_mutex);
@@ -252,6 +218,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
mutex_init(&ssp->srcu_sup->srcu_barrier_mutex);
atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0);
INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu);
+ init_irq_work(&ssp->srcu_sup->irq_work, srcu_irq_work);
ssp->srcu_sup->sda_is_static = is_static;
if (!is_static) {
ssp->sda = alloc_percpu(struct srcu_data);
@@ -263,9 +230,12 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
ssp->srcu_sup->srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL;
ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
- if (!init_srcu_struct_nodes(ssp, is_static ? GFP_ATOMIC : GFP_KERNEL))
+ if (!preemptible())
+ WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC);
+ else if (init_srcu_struct_nodes(ssp, GFP_KERNEL))
+ WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
+ else
goto err_free_sda;
- WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
}
ssp->srcu_sup->srcu_ssp = ssp;
smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed,
@@ -394,20 +364,20 @@ static void srcu_transition_to_big(struct srcu_struct *ssp)
/* Double-checked locking on ->srcu_size-state. */
if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL)
return;
- spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
+ raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) {
- spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+ raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
return;
}
__srcu_transition_to_big(ssp);
- spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+ raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}
/*
* Check to see if the just-encountered contention event justifies
* a transition to SRCU_SIZE_BIG.
*/
-static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
+static void raw_spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
{
unsigned long j;
@@ -429,16 +399,16 @@ static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
* to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module
* parameter permits this.
*/
-static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags)
+static void raw_spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags)
{
struct srcu_struct *ssp = sdp->ssp;
- if (spin_trylock_irqsave_rcu_node(sdp, *flags))
+ if (raw_spin_trylock_irqsave_rcu_node(sdp, *flags))
return;
- spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
- spin_lock_irqsave_check_contention(ssp);
- spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags);
- spin_lock_irqsave_rcu_node(sdp, *flags);
+ raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
+ raw_spin_lock_irqsave_check_contention(ssp);
+ raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags);
+ raw_spin_lock_irqsave_rcu_node(sdp, *flags);
}
/*
@@ -447,12 +417,12 @@ static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned lon
* to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module
* parameter permits this.
*/
-static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags)
+static void raw_spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags)
{
- if (spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags))
+ if (raw_spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags))
return;
- spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
- spin_lock_irqsave_check_contention(ssp);
+ raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
+ raw_spin_lock_irqsave_check_contention(ssp);
}
/*
@@ -470,13 +440,13 @@ static void check_init_srcu_struct(struct srcu_struct *ssp)
/* The smp_load_acquire() pairs with the smp_store_release(). */
if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed))) /*^^^*/
return; /* Already initialized. */
- spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
+ raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq_needed)) {
- spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+ raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
return;
}
init_srcu_struct_fields(ssp, true);
- spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+ raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}
/*
@@ -742,13 +712,15 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
unsigned long delay;
struct srcu_usage *sup = ssp->srcu_sup;
- spin_lock_irq_rcu_node(ssp->srcu_sup);
+ raw_spin_lock_irq_rcu_node(ssp->srcu_sup);
delay = srcu_get_delay(ssp);
- spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ raw_spin_unlock_irq_rcu_node(ssp->srcu_sup);
if (WARN_ON(!delay))
return; /* Just leak it! */
if (WARN_ON(srcu_readers_active(ssp)))
return; /* Just leak it! */
+ /* Wait for irq_work to finish first as it may queue a new work. */
+ irq_work_sync(&sup->irq_work);
flush_delayed_work(&sup->work);
for_each_possible_cpu(cpu) {
struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
@@ -960,7 +932,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
mutex_lock(&sup->srcu_cb_mutex);
/* End the current grace period. */
- spin_lock_irq_rcu_node(sup);
+ raw_spin_lock_irq_rcu_node(sup);
idx = rcu_seq_state(sup->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
if (srcu_gp_is_expedited(ssp))
@@ -971,7 +943,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
gpseq = rcu_seq_current(&sup->srcu_gp_seq);
if (ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, gpseq))
WRITE_ONCE(sup->srcu_gp_seq_needed_exp, gpseq);
- spin_unlock_irq_rcu_node(sup);
+ raw_spin_unlock_irq_rcu_node(sup);
mutex_unlock(&sup->srcu_gp_mutex);
/* A new grace period can start at this point. But only one. */
@@ -983,7 +955,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
} else {
idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
srcu_for_each_node_breadth_first(ssp, snp) {
- spin_lock_irq_rcu_node(snp);
+ raw_spin_lock_irq_rcu_node(snp);
cbs = false;
last_lvl = snp >= sup->level[rcu_num_lvls - 1];
if (last_lvl)
@@ -998,7 +970,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
else
mask = snp->srcu_data_have_cbs[idx];
snp->srcu_data_have_cbs[idx] = 0;
- spin_unlock_irq_rcu_node(snp);
+ raw_spin_unlock_irq_rcu_node(snp);
if (cbs)
srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay);
}
@@ -1008,27 +980,27 @@ static void srcu_gp_end(struct srcu_struct *ssp)
if (!(gpseq & counter_wrap_check))
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_irq_rcu_node(sdp);
+ raw_spin_lock_irq_rcu_node(sdp);
if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100))
sdp->srcu_gp_seq_needed = gpseq;
if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100))
sdp->srcu_gp_seq_needed_exp = gpseq;
- spin_unlock_irq_rcu_node(sdp);
+ raw_spin_unlock_irq_rcu_node(sdp);
}
/* Callback initiation done, allow grace periods after next. */
mutex_unlock(&sup->srcu_cb_mutex);
/* Start a new grace period if needed. */
- spin_lock_irq_rcu_node(sup);
+ raw_spin_lock_irq_rcu_node(sup);
gpseq = rcu_seq_current(&sup->srcu_gp_seq);
if (!rcu_seq_state(gpseq) &&
ULONG_CMP_LT(gpseq, sup->srcu_gp_seq_needed)) {
srcu_gp_start(ssp);
- spin_unlock_irq_rcu_node(sup);
+ raw_spin_unlock_irq_rcu_node(sup);
srcu_reschedule(ssp, 0);
} else {
- spin_unlock_irq_rcu_node(sup);
+ raw_spin_unlock_irq_rcu_node(sup);
}
/* Transition to big if needed. */
@@ -1059,19 +1031,19 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) ||
(!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)))
return;
- spin_lock_irqsave_rcu_node(snp, flags);
+ raw_spin_lock_irqsave_rcu_node(snp, flags);
sgsne = snp->srcu_gp_seq_needed_exp;
if (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)) {
- spin_unlock_irqrestore_rcu_node(snp, flags);
+ raw_spin_unlock_irqrestore_rcu_node(snp, flags);
return;
}
WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore_rcu_node(snp, flags);
+ raw_spin_unlock_irqrestore_rcu_node(snp, flags);
}
- spin_lock_irqsave_ssp_contention(ssp, &flags);
+ raw_spin_lock_irqsave_ssp_contention(ssp, &flags);
if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s))
WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+ raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}
/*
@@ -1109,12 +1081,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) {
if (WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && snp != snp_leaf)
return; /* GP already done and CBs recorded. */
- spin_lock_irqsave_rcu_node(snp, flags);
+ raw_spin_lock_irqsave_rcu_node(snp, flags);
snp_seq = snp->srcu_have_cbs[idx];
if (!srcu_invl_snp_seq(snp_seq) && ULONG_CMP_GE(snp_seq, s)) {
if (snp == snp_leaf && snp_seq == s)
snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
- spin_unlock_irqrestore_rcu_node(snp, flags);
+ raw_spin_unlock_irqrestore_rcu_node(snp, flags);
if (snp == snp_leaf && snp_seq != s) {
srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0);
return;
@@ -1129,11 +1101,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
sgsne = snp->srcu_gp_seq_needed_exp;
if (!do_norm && (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, s)))
WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore_rcu_node(snp, flags);
+ raw_spin_unlock_irqrestore_rcu_node(snp, flags);
}
/* Top of tree, must ensure the grace period will be started. */
- spin_lock_irqsave_ssp_contention(ssp, &flags);
+ raw_spin_lock_irqsave_ssp_contention(ssp, &flags);
if (ULONG_CMP_LT(sup->srcu_gp_seq_needed, s)) {
/*
* Record need for grace period s. Pair with load
@@ -1154,13 +1126,17 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
// it isn't. And it does not have to be. After all, it
// can only be executed during early boot when there is only
// the one boot CPU running with interrupts still disabled.
+ //
+ // Use an irq_work here to avoid acquiring runqueue lock with
+ // srcu rcu_node::lock held. BPF instrument could introduce the
+ // opposite dependency, hence we need to break the possible
+ // locking dependency here.
if (likely(srcu_init_done))
- queue_delayed_work(rcu_gp_wq, &sup->work,
- !!srcu_get_delay(ssp));
+ irq_work_queue(&sup->irq_work);
else if (list_empty(&sup->work.work.entry))
list_add(&sup->work.work.entry, &srcu_boot_list);
}
- spin_unlock_irqrestore_rcu_node(sup, flags);
+ raw_spin_unlock_irqrestore_rcu_node(sup, flags);
}
/*
@@ -1172,9 +1148,9 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
{
unsigned long curdelay;
- spin_lock_irq_rcu_node(ssp->srcu_sup);
+ raw_spin_lock_irq_rcu_node(ssp->srcu_sup);
curdelay = !srcu_get_delay(ssp);
- spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ raw_spin_unlock_irq_rcu_node(ssp->srcu_sup);
for (;;) {
if (srcu_readers_active_idx_check(ssp, idx))
@@ -1285,12 +1261,12 @@ static bool srcu_should_expedite(struct srcu_struct *ssp)
return false;
/* If the local srcu_data structure has callbacks, not idle. */
sdp = raw_cpu_ptr(ssp->sda);
- spin_lock_irqsave_rcu_node(sdp, flags);
+ raw_spin_lock_irqsave_rcu_node(sdp, flags);
if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
- spin_unlock_irqrestore_rcu_node(sdp, flags);
+ raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
return false; /* Callbacks already present, so not idle. */
}
- spin_unlock_irqrestore_rcu_node(sdp, flags);
+ raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
/*
* No local callbacks, so probabilistically probe global state.
@@ -1350,7 +1326,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
else
sdp = raw_cpu_ptr(ssp->sda);
- spin_lock_irqsave_sdp_contention(sdp, &flags);
+ raw_spin_lock_irqsave_sdp_contention(sdp, &flags);
if (rhp)
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
/*
@@ -1410,7 +1386,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
sdp->srcu_gp_seq_needed_exp = s;
needexp = true;
}
- spin_unlock_irqrestore_rcu_node(sdp, flags);
+ raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
/* Ensure that snp node tree is fully initialized before traversing it */
if (ss_state < SRCU_SIZE_WAIT_BARRIER)
@@ -1522,7 +1498,7 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
/*
* Make sure that later code is ordered after the SRCU grace
- * period. This pairs with the spin_lock_irq_rcu_node()
+ * period. This pairs with the raw_spin_lock_irq_rcu_node()
* in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed
* because the current CPU might have been totally uninvolved with
* (and thus unordered against) that grace period.
@@ -1701,7 +1677,7 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
*/
static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp)
{
- spin_lock_irq_rcu_node(sdp);
+ raw_spin_lock_irq_rcu_node(sdp);
atomic_inc(&ssp->srcu_sup->srcu_barrier_cpu_cnt);
sdp->srcu_barrier_head.func = srcu_barrier_cb;
debug_rcu_head_queue(&sdp->srcu_barrier_head);
@@ -1710,7 +1686,7 @@ static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp)
debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
atomic_dec(&ssp->srcu_sup->srcu_barrier_cpu_cnt);
}
- spin_unlock_irq_rcu_node(sdp);
+ raw_spin_unlock_irq_rcu_node(sdp);
}
/**
@@ -1761,7 +1737,7 @@ static void srcu_expedite_current_cb(struct rcu_head *rhp)
bool needcb = false;
struct srcu_data *sdp = container_of(rhp, struct srcu_data, srcu_ec_head);
- spin_lock_irqsave_sdp_contention(sdp, &flags);
+ raw_spin_lock_irqsave_sdp_contention(sdp, &flags);
if (sdp->srcu_ec_state == SRCU_EC_IDLE) {
WARN_ON_ONCE(1);
} else if (sdp->srcu_ec_state == SRCU_EC_PENDING) {
@@ -1771,7 +1747,7 @@ static void srcu_expedite_current_cb(struct rcu_head *rhp)
sdp->srcu_ec_state = SRCU_EC_PENDING;
needcb = true;
}
- spin_unlock_irqrestore_rcu_node(sdp, flags);
+ raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
// If needed, requeue ourselves as an expedited SRCU callback.
if (needcb)
__call_srcu(sdp->ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false);
@@ -1795,7 +1771,7 @@ void srcu_expedite_current(struct srcu_struct *ssp)
migrate_disable();
sdp = this_cpu_ptr(ssp->sda);
- spin_lock_irqsave_sdp_contention(sdp, &flags);
+ raw_spin_lock_irqsave_sdp_contention(sdp, &flags);
if (sdp->srcu_ec_state == SRCU_EC_IDLE) {
sdp->srcu_ec_state = SRCU_EC_PENDING;
needcb = true;
@@ -1804,7 +1780,7 @@ void srcu_expedite_current(struct srcu_struct *ssp)
} else {
WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST);
}
- spin_unlock_irqrestore_rcu_node(sdp, flags);
+ raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
// If needed, queue an expedited SRCU callback.
if (needcb)
__call_srcu(ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false);
@@ -1848,17 +1824,17 @@ static void srcu_advance_state(struct srcu_struct *ssp)
*/
idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq)); /* ^^^ */
if (idx == SRCU_STATE_IDLE) {
- spin_lock_irq_rcu_node(ssp->srcu_sup);
+ raw_spin_lock_irq_rcu_node(ssp->srcu_sup);
if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq));
- spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ raw_spin_unlock_irq_rcu_node(ssp->srcu_sup);
mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return;
}
idx = rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq));
if (idx == SRCU_STATE_IDLE)
srcu_gp_start(ssp);
- spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ raw_spin_unlock_irq_rcu_node(ssp->srcu_sup);
if (idx != SRCU_STATE_IDLE) {
mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* Someone else started the grace period. */
@@ -1872,10 +1848,10 @@ static void srcu_advance_state(struct srcu_struct *ssp)
return; /* readers present, retry later. */
}
srcu_flip(ssp);
- spin_lock_irq_rcu_node(ssp->srcu_sup);
+ raw_spin_lock_irq_rcu_node(ssp->srcu_sup);
rcu_seq_set_state(&ssp->srcu_sup->srcu_gp_seq, SRCU_STATE_SCAN2);
ssp->srcu_sup->srcu_n_exp_nodelay = 0;
- spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ raw_spin_unlock_irq_rcu_node(ssp->srcu_sup);
}
if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
@@ -1913,7 +1889,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
ssp = sdp->ssp;
rcu_cblist_init(&ready_cbs);
- spin_lock_irq_rcu_node(sdp);
+ raw_spin_lock_irq_rcu_node(sdp);
WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL));
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
@@ -1924,7 +1900,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
*/
if (sdp->srcu_cblist_invoking ||
!rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
- spin_unlock_irq_rcu_node(sdp);
+ raw_spin_unlock_irq_rcu_node(sdp);
return; /* Someone else on the job or nothing to do. */
}
@@ -1932,7 +1908,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
sdp->srcu_cblist_invoking = true;
rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
len = ready_cbs.len;
- spin_unlock_irq_rcu_node(sdp);
+ raw_spin_unlock_irq_rcu_node(sdp);
rhp = rcu_cblist_dequeue(&ready_cbs);
for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
debug_rcu_head_unqueue(rhp);
@@ -1947,11 +1923,11 @@ static void srcu_invoke_callbacks(struct work_struct *work)
* Update counts, accelerate new callbacks, and if needed,
* schedule another round of callback invocation.
*/
- spin_lock_irq_rcu_node(sdp);
+ raw_spin_lock_irq_rcu_node(sdp);
rcu_segcblist_add_len(&sdp->srcu_cblist, -len);
sdp->srcu_cblist_invoking = false;
more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
- spin_unlock_irq_rcu_node(sdp);
+ raw_spin_unlock_irq_rcu_node(sdp);
/* An SRCU barrier or callbacks from previous nesting work pending */
if (more)
srcu_schedule_cbs_sdp(sdp, 0);
@@ -1965,7 +1941,7 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
{
bool pushgp = true;
- spin_lock_irq_rcu_node(ssp->srcu_sup);
+ raw_spin_lock_irq_rcu_node(ssp->srcu_sup);
if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq))) {
/* All requests fulfilled, time to go idle. */
@@ -1975,7 +1951,7 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
/* Outstanding request and no GP. Start one. */
srcu_gp_start(ssp);
}
- spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ raw_spin_unlock_irq_rcu_node(ssp->srcu_sup);
if (pushgp)
queue_delayed_work(rcu_gp_wq, &ssp->srcu_sup->work, delay);
@@ -1995,9 +1971,9 @@ static void process_srcu(struct work_struct *work)
ssp = sup->srcu_ssp;
srcu_advance_state(ssp);
- spin_lock_irq_rcu_node(ssp->srcu_sup);
+ raw_spin_lock_irq_rcu_node(ssp->srcu_sup);
curdelay = srcu_get_delay(ssp);
- spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ raw_spin_unlock_irq_rcu_node(ssp->srcu_sup);
if (curdelay) {
WRITE_ONCE(sup->reschedule_count, 0);
} else {
@@ -2015,6 +1991,23 @@ static void process_srcu(struct work_struct *work)
srcu_reschedule(ssp, curdelay);
}
+static void srcu_irq_work(struct irq_work *work)
+{
+ struct srcu_struct *ssp;
+ struct srcu_usage *sup;
+ unsigned long delay;
+ unsigned long flags;
+
+ sup = container_of(work, struct srcu_usage, irq_work);
+ ssp = sup->srcu_ssp;
+
+ raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
+ delay = srcu_get_delay(ssp);
+ raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+
+ queue_delayed_work(rcu_gp_wq, &sup->work, !!delay);
+}
+
void srcutorture_get_gp_data(struct srcu_struct *ssp, int *flags,
unsigned long *gp_seq)
{
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 2b55e6acf3c1..48f0d803c8e2 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -291,9 +291,9 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
shift = ilog2(rcu_task_cpu_ids / lim);
if (((rcu_task_cpu_ids - 1) >> shift) >= lim)
shift++;
- WRITE_ONCE(rtp->percpu_enqueue_shift, shift);
- WRITE_ONCE(rtp->percpu_dequeue_lim, lim);
- smp_store_release(&rtp->percpu_enqueue_lim, lim);
+ rtp->percpu_enqueue_shift = shift;
+ rtp->percpu_dequeue_lim = lim;
+ rtp->percpu_enqueue_lim = lim;
pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d rcu_task_cpu_ids=%d.\n",
rtp->name, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim),
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index b3337c7231cc..1047b30cd46b 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -379,6 +379,38 @@ static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
}
/*
+ * Determine if the bypass queue needs to be flushed based on time and size.
+ * For lazy-only bypass queues, use the lazy flush timeout; otherwise flush
+ * based on jiffy advancement. The flush_faster parameter controls flush aggressiveness.
+ */
+static bool nocb_bypass_needs_flush(struct rcu_data *rdp, long bypass_ncbs,
+ long lazy_ncbs, unsigned long j,
+ bool flush_faster)
+{
+ bool bypass_is_lazy;
+ unsigned long bypass_first;
+ unsigned long flush_timeout;
+ long qhimark_thresh;
+
+ if (!bypass_ncbs)
+ return false;
+
+ qhimark_thresh = flush_faster ? qhimark : 2 * qhimark;
+ if (bypass_ncbs >= qhimark_thresh)
+ return true;
+
+ bypass_first = READ_ONCE(rdp->nocb_bypass_first);
+ bypass_is_lazy = (bypass_ncbs == lazy_ncbs);
+
+ if (bypass_is_lazy)
+ flush_timeout = rcu_get_jiffies_lazy_flush();
+ else
+ flush_timeout = flush_faster ? 0 : 1;
+
+ return time_after(j, bypass_first + flush_timeout);
+}
+
+/*
* See whether it is appropriate to use the ->nocb_bypass list in order
* to control contention on ->nocb_lock. A limited number of direct
* enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass
@@ -404,7 +436,8 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
unsigned long cur_gp_seq;
unsigned long j = jiffies;
long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- bool bypass_is_lazy = (ncbs == READ_ONCE(rdp->lazy_len));
+ long lazy_len = READ_ONCE(rdp->lazy_len);
+ bool bypass_is_lazy = (ncbs == lazy_len);
lockdep_assert_irqs_disabled();
@@ -456,10 +489,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
// If ->nocb_bypass has been used too long or is too full,
// flush ->nocb_bypass to ->cblist.
- if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) ||
- (ncbs && bypass_is_lazy &&
- (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()))) ||
- ncbs >= qhimark) {
+ if (nocb_bypass_needs_flush(rdp, ncbs, lazy_len, j, true)) {
rcu_nocb_lock(rdp);
*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
@@ -673,15 +703,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
lazy_ncbs = READ_ONCE(rdp->lazy_len);
- if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) &&
- (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()) ||
- bypass_ncbs > 2 * qhimark)) {
- flush_bypass = true;
- } else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) &&
- (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
- bypass_ncbs > 2 * qhimark)) {
- flush_bypass = true;
- } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
+ flush_bypass = nocb_bypass_needs_flush(rdp, bypass_ncbs, lazy_ncbs, j, false);
+ if (!flush_bypass && !bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
rcu_nocb_unlock_irqrestore(rdp, flags);
continue; /* No callbacks here, try next. */
}
@@ -1081,30 +1104,6 @@ static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp)
return 0;
}
-int rcu_nocb_cpu_deoffload(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- int ret = 0;
-
- cpus_read_lock();
- mutex_lock(&rcu_state.nocb_mutex);
- if (rcu_rdp_is_offloaded(rdp)) {
- if (!cpu_online(cpu)) {
- ret = rcu_nocb_rdp_deoffload(rdp);
- if (!ret)
- cpumask_clear_cpu(cpu, rcu_nocb_mask);
- } else {
- pr_info("NOCB: Cannot CB-deoffload online CPU %d\n", rdp->cpu);
- ret = -EINVAL;
- }
- }
- mutex_unlock(&rcu_state.nocb_mutex);
- cpus_read_unlock();
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
-
static bool rcu_nocb_rdp_offload_wait_cond(struct rcu_data *rdp)
{
unsigned long flags;
@@ -1149,28 +1148,52 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp)
return 0;
}
-int rcu_nocb_cpu_offload(int cpu)
+/* Common helper for CPU offload/deoffload operations. */
+static int rcu_nocb_cpu_toggle_offload(int cpu, bool offload)
{
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
int ret = 0;
cpus_read_lock();
mutex_lock(&rcu_state.nocb_mutex);
- if (!rcu_rdp_is_offloaded(rdp)) {
- if (!cpu_online(cpu)) {
- ret = rcu_nocb_rdp_offload(rdp);
- if (!ret)
- cpumask_set_cpu(cpu, rcu_nocb_mask);
- } else {
- pr_info("NOCB: Cannot CB-offload online CPU %d\n", rdp->cpu);
- ret = -EINVAL;
- }
+
+ /* Already in desired state, nothing to do. */
+ if (rcu_rdp_is_offloaded(rdp) == offload)
+ goto out_unlock;
+
+ if (cpu_online(cpu)) {
+ pr_info("NOCB: Cannot CB-%soffload online CPU %d\n",
+ offload ? "" : "de", rdp->cpu);
+ ret = -EINVAL;
+ goto out_unlock;
}
+
+ if (offload) {
+ ret = rcu_nocb_rdp_offload(rdp);
+ if (!ret)
+ cpumask_set_cpu(cpu, rcu_nocb_mask);
+ } else {
+ ret = rcu_nocb_rdp_deoffload(rdp);
+ if (!ret)
+ cpumask_clear_cpu(cpu, rcu_nocb_mask);
+ }
+
+out_unlock:
mutex_unlock(&rcu_state.nocb_mutex);
cpus_read_unlock();
-
return ret;
}
+
+int rcu_nocb_cpu_deoffload(int cpu)
+{
+ return rcu_nocb_cpu_toggle_offload(cpu, false /* de-offload */);
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
+
+int rcu_nocb_cpu_offload(int cpu)
+{
+ return rcu_nocb_cpu_toggle_offload(cpu, true /* offload */);
+}
EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
#ifdef CONFIG_RCU_LAZY
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index d98a5c38e19c..b62735a67884 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -538,6 +538,28 @@ long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool do
EXPORT_SYMBOL_GPL(torture_sched_setaffinity);
#endif
+#if IS_ENABLED(CONFIG_TRIVIAL_PREEMPT_RCU)
+// Trivial and stupid grace-period wait. Defined here so that lockdep
+// kernels can find tasklist_lock.
+void synchronize_rcu_trivial_preempt(void)
+{
+ struct task_struct *g;
+ struct task_struct *t;
+
+ smp_mb(); // Order prior accesses before grace-period start.
+ rcu_read_lock(); // Protect task list.
+ for_each_process_thread(g, t) {
+ if (t == current)
+ continue; // Don't deadlock on ourselves!
+ // Order later rcu_read_lock() on other tasks after QS.
+ while (smp_load_acquire(&t->rcu_trivial_preempt_nesting))
+ continue;
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_trivial_preempt);
+#endif // #if IS_ENABLED(CONFIG_TRIVIAL_PREEMPT_RCU)
+
int rcu_cpu_stall_notifiers __read_mostly; // !0 = provide stall notifiers (rarely useful)
EXPORT_SYMBOL_GPL(rcu_cpu_stall_notifiers);
diff --git a/kernel/resource.c b/kernel/resource.c
index bb966699da31..d02a53fb95d8 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -727,45 +727,46 @@ static int __find_resource_space(struct resource *root, struct resource *old,
struct resource_constraint *constraint)
{
struct resource *this = root->child;
- struct resource tmp = *new, avail, alloc;
+ struct resource full_avail = *new, avail, alloc;
resource_alignf alignf = constraint->alignf;
- tmp.start = root->start;
+ full_avail.start = root->start;
/*
* Skip past an allocated resource that starts at 0, since the assignment
- * of this->start - 1 to tmp->end below would cause an underflow.
+ * of this->start - 1 to full_avail->end below would cause an underflow.
*/
if (this && this->start == root->start) {
- tmp.start = (this == old) ? old->start : this->end + 1;
+ full_avail.start = (this == old) ? old->start : this->end + 1;
this = this->sibling;
}
for(;;) {
if (this)
- tmp.end = (this == old) ? this->end : this->start - 1;
+ full_avail.end = (this == old) ? this->end : this->start - 1;
else
- tmp.end = root->end;
+ full_avail.end = root->end;
- if (tmp.end < tmp.start)
+ if (full_avail.end < full_avail.start)
goto next;
- resource_clip(&tmp, constraint->min, constraint->max);
- arch_remove_reservations(&tmp);
+ resource_clip(&full_avail, constraint->min, constraint->max);
+ arch_remove_reservations(&full_avail);
/* Check for overflow after ALIGN() */
- avail.start = ALIGN(tmp.start, constraint->align);
- avail.end = tmp.end;
- avail.flags = new->flags & ~IORESOURCE_UNSET;
- if (avail.start >= tmp.start) {
+ avail.start = ALIGN(full_avail.start, constraint->align);
+ avail.end = full_avail.end;
+ avail.flags = new->flags;
+ if (avail.start >= full_avail.start) {
alloc.flags = avail.flags;
if (alignf) {
alloc.start = alignf(constraint->alignf_data,
- &avail, size, constraint->align);
+ &avail, &full_avail,
+ size, constraint->align);
} else {
alloc.start = avail.start;
}
alloc.end = alloc.start + size - 1;
if (alloc.start <= alloc.end &&
- resource_contains(&avail, &alloc)) {
+ __resource_contains_unbound(&full_avail, &alloc)) {
new->start = alloc.start;
new->end = alloc.end;
return 0;
@@ -776,7 +777,7 @@ next: if (!this || this->end == root->end)
break;
if (this != old)
- tmp.start = this->end + 1;
+ full_avail.start = this->end + 1;
this = this->sibling;
}
return -EBUSY;
diff --git a/kernel/rseq.c b/kernel/rseq.c
index b0973d19f366..38d3ef540760 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -80,6 +80,7 @@
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/types.h>
+#include <linux/rseq.h>
#include <asm/ptrace.h>
#define CREATE_TRACE_POINTS
@@ -449,13 +450,14 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
* auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq
* size, the required alignment is the original struct rseq alignment.
*
- * In order to be valid, rseq_len is either the original rseq size, or
- * large enough to contain all supported fields, as communicated to
+ * The rseq_len is required to be greater or equal to the original rseq
+ * size. In order to be valid, rseq_len is either the original rseq size,
+ * or large enough to contain all supported fields, as communicated to
* user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
*/
if (rseq_len < ORIG_RSEQ_SIZE ||
(rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) ||
- (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
+ (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) ||
rseq_len < offsetof(struct rseq, end))))
return -EINVAL;
if (!access_ok(rseq, rseq_len))
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 759777694c78..da20fb6ea25a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -122,6 +122,11 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_entry_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_exit_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_set_need_resched_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_dl_throttle_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_dl_replenish_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_dl_update_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_dl_server_start_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_dl_server_stop_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
@@ -687,11 +692,6 @@ bool raw_spin_rq_trylock(struct rq *rq)
}
}
-void raw_spin_rq_unlock(struct rq *rq)
-{
- raw_spin_unlock(rq_lockp(rq));
-}
-
/*
* double_rq_lock - safely lock two runqueues
*/
@@ -872,7 +872,14 @@ void update_rq_clock(struct rq *rq)
* Use HR-timers to deliver accurate preemption points.
*/
-static void hrtick_clear(struct rq *rq)
+enum {
+ HRTICK_SCHED_NONE = 0,
+ HRTICK_SCHED_DEFER = BIT(1),
+ HRTICK_SCHED_START = BIT(2),
+ HRTICK_SCHED_REARM_HRTIMER = BIT(3)
+};
+
+static void __used hrtick_clear(struct rq *rq)
{
if (hrtimer_active(&rq->hrtick_timer))
hrtimer_cancel(&rq->hrtick_timer);
@@ -897,12 +904,24 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static void __hrtick_restart(struct rq *rq)
+static inline bool hrtick_needs_rearm(struct hrtimer *timer, ktime_t expires)
+{
+ /*
+ * Queued is false when the timer is not started or currently
+ * running the callback. In both cases, restart. If queued check
+ * whether the expiry time actually changes substantially.
+ */
+ return !hrtimer_is_queued(timer) ||
+ abs(expires - hrtimer_get_expires(timer)) > 5000;
+}
+
+static void hrtick_cond_restart(struct rq *rq)
{
struct hrtimer *timer = &rq->hrtick_timer;
ktime_t time = rq->hrtick_time;
- hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
+ if (hrtick_needs_rearm(timer, time))
+ hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
}
/*
@@ -914,7 +933,7 @@ static void __hrtick_start(void *arg)
struct rq_flags rf;
rq_lock(rq, &rf);
- __hrtick_restart(rq);
+ hrtick_cond_restart(rq);
rq_unlock(rq, &rf);
}
@@ -925,7 +944,6 @@ static void __hrtick_start(void *arg)
*/
void hrtick_start(struct rq *rq, u64 delay)
{
- struct hrtimer *timer = &rq->hrtick_timer;
s64 delta;
/*
@@ -933,27 +951,67 @@ void hrtick_start(struct rq *rq, u64 delay)
* doesn't make sense and can cause timer DoS.
*/
delta = max_t(s64, delay, 10000LL);
- rq->hrtick_time = ktime_add_ns(hrtimer_cb_get_time(timer), delta);
+
+ /*
+ * If this is in the middle of schedule() only note the delay
+ * and let hrtick_schedule_exit() deal with it.
+ */
+ if (rq->hrtick_sched) {
+ rq->hrtick_sched |= HRTICK_SCHED_START;
+ rq->hrtick_delay = delta;
+ return;
+ }
+
+ rq->hrtick_time = ktime_add_ns(ktime_get(), delta);
+ if (!hrtick_needs_rearm(&rq->hrtick_timer, rq->hrtick_time))
+ return;
if (rq == this_rq())
- __hrtick_restart(rq);
+ hrtimer_start(&rq->hrtick_timer, rq->hrtick_time, HRTIMER_MODE_ABS_PINNED_HARD);
else
smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
}
-static void hrtick_rq_init(struct rq *rq)
+static inline void hrtick_schedule_enter(struct rq *rq)
{
- INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
- hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
+ rq->hrtick_sched = HRTICK_SCHED_DEFER;
+ if (hrtimer_test_and_clear_rearm_deferred())
+ rq->hrtick_sched |= HRTICK_SCHED_REARM_HRTIMER;
}
-#else /* !CONFIG_SCHED_HRTICK: */
-static inline void hrtick_clear(struct rq *rq)
+
+static inline void hrtick_schedule_exit(struct rq *rq)
{
+ if (rq->hrtick_sched & HRTICK_SCHED_START) {
+ rq->hrtick_time = ktime_add_ns(ktime_get(), rq->hrtick_delay);
+ hrtick_cond_restart(rq);
+ } else if (idle_rq(rq)) {
+ /*
+ * No need for using hrtimer_is_active(). The timer is CPU local
+ * and interrupts are disabled, so the callback cannot be
+ * running and the queued state is valid.
+ */
+ if (hrtimer_is_queued(&rq->hrtick_timer))
+ hrtimer_cancel(&rq->hrtick_timer);
+ }
+
+ if (rq->hrtick_sched & HRTICK_SCHED_REARM_HRTIMER)
+ __hrtimer_rearm_deferred();
+
+ rq->hrtick_sched = HRTICK_SCHED_NONE;
}
-static inline void hrtick_rq_init(struct rq *rq)
+static void hrtick_rq_init(struct rq *rq)
{
+ INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
+ rq->hrtick_sched = HRTICK_SCHED_NONE;
+ hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_HARD | HRTIMER_MODE_LAZY_REARM);
}
+#else /* !CONFIG_SCHED_HRTICK: */
+static inline void hrtick_clear(struct rq *rq) { }
+static inline void hrtick_rq_init(struct rq *rq) { }
+static inline void hrtick_schedule_enter(struct rq *rq) { }
+static inline void hrtick_schedule_exit(struct rq *rq) { }
#endif /* !CONFIG_SCHED_HRTICK */
/*
@@ -3847,6 +3905,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
+ int this_cpu = smp_processor_id();
+
/* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */
if (!scx_allow_ttwu_queue(p))
return false;
@@ -3871,10 +3931,10 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
* If the CPU does not share cache, then queue the task on the
* remote rqs wakelist to avoid accessing remote data.
*/
- if (!cpus_share_cache(smp_processor_id(), cpu))
+ if (!cpus_share_cache(this_cpu, cpu))
return true;
- if (cpu == smp_processor_id())
+ if (cpu == this_cpu)
return false;
/*
@@ -4721,7 +4781,7 @@ int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- return scx_fork(p);
+ return scx_fork(p, kargs);
}
void sched_cancel_fork(struct task_struct *p)
@@ -4729,13 +4789,16 @@ void sched_cancel_fork(struct task_struct *p)
scx_cancel_fork(p);
}
+static void sched_mm_cid_fork(struct task_struct *t);
+
void sched_post_fork(struct task_struct *p)
{
+ sched_mm_cid_fork(p);
uclamp_post_fork(p);
scx_post_fork(p);
}
-unsigned long to_ratio(u64 period, u64 runtime)
+u64 to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
return BW_UNIT;
@@ -4910,6 +4973,34 @@ static inline void finish_task(struct task_struct *prev)
smp_store_release(&prev->on_cpu, 0);
}
+/*
+ * Only called from __schedule context
+ *
+ * There are some cases where we are going to re-do the action
+ * that added the balance callbacks. We may not be in a state
+ * where we can run them, so just zap them so they can be
+ * properly re-added on the next time around. This is similar
+ * handling to running the callbacks, except we just don't call
+ * them.
+ */
+static void zap_balance_callbacks(struct rq *rq)
+{
+ struct balance_callback *next, *head;
+ bool found = false;
+
+ lockdep_assert_rq_held(rq);
+
+ head = rq->balance_callback;
+ while (head) {
+ if (head == &balance_push_callback)
+ found = true;
+ next = head->next;
+ head->next = NULL;
+ head = next;
+ }
+ rq->balance_callback = found ? &balance_push_callback : NULL;
+}
+
static void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
{
void (*func)(struct rq *rq);
@@ -5029,6 +5120,7 @@ static inline void finish_lock_switch(struct rq *rq)
*/
spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
__balance_callbacks(rq, NULL);
+ hrtick_schedule_exit(rq);
raw_spin_rq_unlock_irq(rq);
}
@@ -5678,7 +5770,7 @@ static void sched_tick_remote(struct work_struct *work)
os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
if (os == TICK_SCHED_REMOTE_RUNNING)
- queue_delayed_work(system_unbound_wq, dwork, HZ);
+ queue_delayed_work(system_dfl_wq, dwork, HZ);
}
static void sched_tick_start(int cpu)
@@ -5697,7 +5789,7 @@ static void sched_tick_start(int cpu)
if (os == TICK_SCHED_REMOTE_OFFLINE) {
twork->cpu = cpu;
INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
- queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+ queue_delayed_work(system_dfl_wq, &twork->work, HZ);
}
}
@@ -6495,6 +6587,8 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
if (signal_pending_state(task_state, p)) {
WRITE_ONCE(p->__state, TASK_RUNNING);
*task_state_p = TASK_RUNNING;
+ set_task_blocked_on_waking(p, NULL);
+
return false;
}
@@ -6532,6 +6626,21 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
}
#ifdef CONFIG_SCHED_PROXY_EXEC
+static inline void proxy_set_task_cpu(struct task_struct *p, int cpu)
+{
+ unsigned int wake_cpu;
+
+ /*
+ * Since we are enqueuing a blocked task on a cpu it may
+ * not be able to run on, preserve wake_cpu when we
+ * __set_task_cpu so we can return the task to where it
+ * was previously runnable.
+ */
+ wake_cpu = p->wake_cpu;
+ __set_task_cpu(p, cpu);
+ p->wake_cpu = wake_cpu;
+}
+
static inline struct task_struct *proxy_resched_idle(struct rq *rq)
{
put_prev_set_next_task(rq, rq->donor, rq->idle);
@@ -6540,7 +6649,7 @@ static inline struct task_struct *proxy_resched_idle(struct rq *rq)
return rq->idle;
}
-static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor)
+static bool proxy_deactivate(struct rq *rq, struct task_struct *donor)
{
unsigned long state = READ_ONCE(donor->__state);
@@ -6560,17 +6669,140 @@ static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor)
return try_to_block_task(rq, donor, &state, true);
}
-static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor)
+static inline void proxy_release_rq_lock(struct rq *rq, struct rq_flags *rf)
+ __releases(__rq_lockp(rq))
+{
+ /*
+ * The class scheduler may have queued a balance callback
+ * from pick_next_task() called earlier.
+ *
+ * So here we have to zap callbacks before unlocking the rq
+ * as another CPU may jump in and call sched_balance_rq
+ * which can trip the warning in rq_pin_lock() if we
+ * leave callbacks set.
+ *
+ * After we later reacquire the rq lock, we will force __schedule()
+ * to pick_again, so the callbacks will get re-established.
+ */
+ zap_balance_callbacks(rq);
+ rq_unpin_lock(rq, rf);
+ raw_spin_rq_unlock(rq);
+}
+
+static inline void proxy_reacquire_rq_lock(struct rq *rq, struct rq_flags *rf)
+ __acquires(__rq_lockp(rq))
+{
+ raw_spin_rq_lock(rq);
+ rq_repin_lock(rq, rf);
+ update_rq_clock(rq);
+}
+
+/*
+ * If the blocked-on relationship crosses CPUs, migrate @p to the
+ * owner's CPU.
+ *
+ * This is because we must respect the CPU affinity of execution
+ * contexts (owner) but we can ignore affinity for scheduling
+ * contexts (@p). So we have to move scheduling contexts towards
+ * potential execution contexts.
+ *
+ * Note: The owner can disappear, but simply migrate to @target_cpu
+ * and leave that CPU to sort things out.
+ */
+static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf,
+ struct task_struct *p, int target_cpu)
+ __must_hold(__rq_lockp(rq))
+{
+ struct rq *target_rq = cpu_rq(target_cpu);
+
+ lockdep_assert_rq_held(rq);
+ WARN_ON(p == rq->curr);
+ /*
+ * Since we are migrating a blocked donor, it could be rq->donor,
+ * and we want to make sure there aren't any references from this
+ * rq to it before we drop the lock. This avoids another cpu
+ * jumping in and grabbing the rq lock and referencing rq->donor
+ * or cfs_rq->curr, etc after we have migrated it to another cpu,
+ * and before we pick_again in __schedule.
+ *
+ * So call proxy_resched_idle() to drop the rq->donor references
+ * before we release the lock.
+ */
+ proxy_resched_idle(rq);
+
+ deactivate_task(rq, p, DEQUEUE_NOCLOCK);
+ proxy_set_task_cpu(p, target_cpu);
+
+ proxy_release_rq_lock(rq, rf);
+
+ attach_one_task(target_rq, p);
+
+ proxy_reacquire_rq_lock(rq, rf);
+}
+
+static void proxy_force_return(struct rq *rq, struct rq_flags *rf,
+ struct task_struct *p)
+ __must_hold(__rq_lockp(rq))
{
- if (!__proxy_deactivate(rq, donor)) {
+ struct rq *task_rq, *target_rq = NULL;
+ int cpu, wake_flag = WF_TTWU;
+
+ lockdep_assert_rq_held(rq);
+ WARN_ON(p == rq->curr);
+
+ if (p == rq->donor)
+ proxy_resched_idle(rq);
+
+ proxy_release_rq_lock(rq, rf);
+ /*
+ * We drop the rq lock, and re-grab task_rq_lock to get
+ * the pi_lock (needed for select_task_rq) as well.
+ */
+ scoped_guard (task_rq_lock, p) {
+ task_rq = scope.rq;
+
/*
- * XXX: For now, if deactivation failed, set donor
- * as unblocked, as we aren't doing proxy-migrations
- * yet (more logic will be needed then).
+ * Since we let go of the rq lock, the task may have been
+ * woken or migrated to another rq before we got the
+ * task_rq_lock. So re-check we're on the same RQ. If
+ * not, the task has already been migrated and that CPU
+ * will handle any further migrations.
*/
- donor->blocked_on = NULL;
+ if (task_rq != rq)
+ break;
+
+ /*
+ * Similarly, if we've been dequeued, someone else will
+ * wake us.
+ */
+ if (!task_on_rq_queued(p))
+ break;
+
+ /*
+ * Since we should only be calling here from __schedule()
+ * -> find_proxy_task(), no one else should have
+ * assigned current out from under us. But check and warn
+ * if we see this, then bail.
+ */
+ if (task_current(task_rq, p) || task_on_cpu(task_rq, p)) {
+ WARN_ONCE(1, "%s rq: %i current/on_cpu task %s %d on_cpu: %i\n",
+ __func__, cpu_of(task_rq),
+ p->comm, p->pid, p->on_cpu);
+ break;
+ }
+
+ update_rq_clock(task_rq);
+ deactivate_task(task_rq, p, DEQUEUE_NOCLOCK);
+ cpu = select_task_rq(p, p->wake_cpu, &wake_flag);
+ set_task_cpu(p, cpu);
+ target_rq = cpu_rq(cpu);
+ clear_task_blocked_on(p, NULL);
}
- return NULL;
+
+ if (target_rq)
+ attach_one_task(target_rq, p);
+
+ proxy_reacquire_rq_lock(rq, rf);
}
/*
@@ -6584,31 +6816,41 @@ static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *d
* p->pi_lock
* rq->lock
* mutex->wait_lock
+ * p->blocked_lock
*
* Returns the task that is going to be used as execution context (the one
* that is actually going to be run on cpu_of(rq)).
*/
static struct task_struct *
find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
+ __must_hold(__rq_lockp(rq))
{
struct task_struct *owner = NULL;
+ bool curr_in_chain = false;
int this_cpu = cpu_of(rq);
struct task_struct *p;
struct mutex *mutex;
+ int owner_cpu;
/* Follow blocked_on chain. */
- for (p = donor; task_is_blocked(p); p = owner) {
- mutex = p->blocked_on;
- /* Something changed in the chain, so pick again */
- if (!mutex)
- return NULL;
+ for (p = donor; (mutex = p->blocked_on); p = owner) {
+ /* if it's PROXY_WAKING, do return migration or run if current */
+ if (mutex == PROXY_WAKING) {
+ if (task_current(rq, p)) {
+ clear_task_blocked_on(p, PROXY_WAKING);
+ return p;
+ }
+ goto force_return;
+ }
+
/*
* By taking mutex->wait_lock we hold off concurrent mutex_unlock()
* and ensure @owner sticks around.
*/
guard(raw_spinlock)(&mutex->wait_lock);
+ guard(raw_spinlock)(&p->blocked_lock);
- /* Check again that p is blocked with wait_lock held */
+ /* Check again that p is blocked with blocked_lock held */
if (mutex != __get_task_blocked_on(p)) {
/*
* Something changed in the blocked_on chain and
@@ -6619,20 +6861,39 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
return NULL;
}
+ if (task_current(rq, p))
+ curr_in_chain = true;
+
owner = __mutex_owner(mutex);
if (!owner) {
- __clear_task_blocked_on(p, mutex);
- return p;
+ /*
+ * If there is no owner, either clear blocked_on
+ * and return p (if it is current and safe to
+ * just run on this rq), or return-migrate the task.
+ */
+ if (task_current(rq, p)) {
+ __clear_task_blocked_on(p, NULL);
+ return p;
+ }
+ goto force_return;
}
if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) {
/* XXX Don't handle blocked owners/delayed dequeue yet */
- return proxy_deactivate(rq, donor);
+ if (curr_in_chain)
+ return proxy_resched_idle(rq);
+ goto deactivate;
}
- if (task_cpu(owner) != this_cpu) {
- /* XXX Don't handle migrations yet */
- return proxy_deactivate(rq, donor);
+ owner_cpu = task_cpu(owner);
+ if (owner_cpu != this_cpu) {
+ /*
+ * @owner can disappear, simply migrate to @owner_cpu
+ * and leave that CPU to sort things out.
+ */
+ if (curr_in_chain)
+ return proxy_resched_idle(rq);
+ goto migrate_task;
}
if (task_on_rq_migrating(owner)) {
@@ -6689,9 +6950,20 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
* guarantee its existence, as per ttwu_remote().
*/
}
-
WARN_ON_ONCE(owner && !owner->on_rq);
return owner;
+
+deactivate:
+ if (proxy_deactivate(rq, donor))
+ return NULL;
+ /* If deactivate fails, force return */
+ p = donor;
+force_return:
+ proxy_force_return(rq, rf, p);
+ return NULL;
+migrate_task:
+ proxy_migrate_task(rq, rf, p, owner_cpu);
+ return NULL;
}
#else /* SCHED_PROXY_EXEC */
static struct task_struct *
@@ -6702,23 +6974,6 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
}
#endif /* SCHED_PROXY_EXEC */
-static inline void proxy_tag_curr(struct rq *rq, struct task_struct *owner)
-{
- if (!sched_proxy_exec())
- return;
- /*
- * pick_next_task() calls set_next_task() on the chosen task
- * at some point, which ensures it is not push/pullable.
- * However, the chosen/donor task *and* the mutex owner form an
- * atomic pair wrt push/pull.
- *
- * Make sure owner we run is not pushable. Unfortunately we can
- * only deal with that by means of a dequeue/enqueue cycle. :-/
- */
- dequeue_task(rq, owner, DEQUEUE_NOCLOCK | DEQUEUE_SAVE);
- enqueue_task(rq, owner, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE);
-}
-
/*
* __schedule() is the main scheduler function.
*
@@ -6782,9 +7037,6 @@ static void __sched notrace __schedule(int sched_mode)
schedule_debug(prev, preempt);
- if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
- hrtick_clear(rq);
-
klp_sched_try_switch(prev);
local_irq_disable();
@@ -6811,6 +7063,8 @@ static void __sched notrace __schedule(int sched_mode)
rq_lock(rq, &rf);
smp_mb__after_spinlock();
+ hrtick_schedule_enter(rq);
+
/* Promote REQ to ACT */
rq->clock_update_flags <<= 1;
update_rq_clock(rq);
@@ -6830,6 +7084,7 @@ static void __sched notrace __schedule(int sched_mode)
/* SCX must consult the BPF scheduler to tell if rq is empty */
if (!rq->nr_running && !scx_enabled()) {
next = prev;
+ rq->next_class = &idle_sched_class;
goto picked;
}
} else if (!preempt && prev_state) {
@@ -6845,16 +7100,45 @@ static void __sched notrace __schedule(int sched_mode)
}
pick_again:
+ assert_balance_callbacks_empty(rq);
next = pick_next_task(rq, rq->donor, &rf);
- rq_set_donor(rq, next);
rq->next_class = next->sched_class;
- if (unlikely(task_is_blocked(next))) {
- next = find_proxy_task(rq, next, &rf);
- if (!next)
- goto pick_again;
- if (next == rq->idle)
- goto keep_resched;
+ if (sched_proxy_exec()) {
+ struct task_struct *prev_donor = rq->donor;
+
+ rq_set_donor(rq, next);
+ if (unlikely(next->blocked_on)) {
+ next = find_proxy_task(rq, next, &rf);
+ if (!next) {
+ zap_balance_callbacks(rq);
+ goto pick_again;
+ }
+ if (next == rq->idle) {
+ zap_balance_callbacks(rq);
+ goto keep_resched;
+ }
+ }
+ if (rq->donor == prev_donor && prev != next) {
+ struct task_struct *donor = rq->donor;
+ /*
+ * When transitioning like:
+ *
+ * prev next
+ * donor: B B
+ * curr: A B or C
+ *
+ * then put_prev_set_next_task() will not have done
+ * anything, since B == B. However, A might have
+ * missed a RT/DL balance opportunity due to being
+ * on_cpu.
+ */
+ donor->sched_class->put_prev_task(rq, donor, donor);
+ donor->sched_class->set_next_task(rq, donor, true);
+ }
+ } else {
+ rq_set_donor(rq, next);
}
+
picked:
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
@@ -6870,9 +7154,6 @@ keep_resched:
*/
RCU_INIT_POINTER(rq->curr, next);
- if (!task_current_donor(rq, next))
- proxy_tag_curr(rq, next);
-
/*
* The membarrier system call requires each architecture
* to have a full memory barrier after updating
@@ -6906,12 +7187,9 @@ keep_resched:
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
} else {
- /* In case next was already curr but just got blocked_donor */
- if (!task_current_donor(rq, next))
- proxy_tag_curr(rq, next);
-
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq, NULL);
+ hrtick_schedule_exit(rq);
raw_spin_rq_unlock_irq(rq);
}
trace_sched_exit_tp(is_switch);
@@ -10616,13 +10894,10 @@ static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pc
}
}
-static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
+static void mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
{
/* Remote access to mm::mm_cid::pcpu requires rq_lock */
guard(task_rq_lock)(t);
- /* If the task is not active it is not in the users count */
- if (!t->mm_cid.active)
- return false;
if (cid_on_task(t->mm_cid.cid)) {
/* If running on the CPU, put the CID in transit mode, otherwise drop it */
if (task_rq(t)->curr == t)
@@ -10630,69 +10905,43 @@ static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm
else
mm_unset_cid_on_task(t);
}
- return true;
}
-static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
+static void mm_cid_fixup_tasks_to_cpus(void)
{
- struct task_struct *p, *t;
- unsigned int users;
-
- /*
- * This can obviously race with a concurrent affinity change, which
- * increases the number of allowed CPUs for this mm, but that does
- * not affect the mode and only changes the CID constraints. A
- * possible switch back to per task mode happens either in the
- * deferred handler function or in the next fork()/exit().
- *
- * The caller has already transferred. The newly incoming task is
- * already accounted for, but not yet visible.
- */
- users = mm->mm_cid.users - 2;
- if (!users)
- return;
-
- guard(rcu)();
- for_other_threads(current, t) {
- if (mm_cid_fixup_task_to_cpu(t, mm))
- users--;
- }
+ struct mm_struct *mm = current->mm;
+ struct task_struct *t;
- if (!users)
- return;
+ lockdep_assert_held(&mm->mm_cid.mutex);
- /* Happens only for VM_CLONE processes. */
- for_each_process_thread(p, t) {
- if (t == current || t->mm != mm)
- continue;
- if (mm_cid_fixup_task_to_cpu(t, mm)) {
- if (--users == 0)
- return;
- }
+ hlist_for_each_entry(t, &mm->mm_cid.user_list, mm_cid.node) {
+ /* Current has already transferred before invoking the fixup. */
+ if (t != current)
+ mm_cid_fixup_task_to_cpu(t, mm);
}
-}
-
-static void mm_cid_fixup_tasks_to_cpus(void)
-{
- struct mm_struct *mm = current->mm;
- mm_cid_do_fixup_tasks_to_cpus(mm);
mm_cid_complete_transit(mm, MM_CID_ONCPU);
}
static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
{
+ lockdep_assert_held(&mm->mm_cid.lock);
+
t->mm_cid.active = 1;
+ hlist_add_head(&t->mm_cid.node, &mm->mm_cid.user_list);
mm->mm_cid.users++;
return mm_update_max_cids(mm);
}
-void sched_mm_cid_fork(struct task_struct *t)
+static void sched_mm_cid_fork(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
bool percpu;
- WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
+ if (!mm)
+ return;
+
+ WARN_ON_ONCE(t->mm_cid.cid != MM_CID_UNSET);
guard(mutex)(&mm->mm_cid.mutex);
scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
@@ -10731,12 +10980,13 @@ void sched_mm_cid_fork(struct task_struct *t)
static bool sched_mm_cid_remove_user(struct task_struct *t)
{
+ lockdep_assert_held(&t->mm->mm_cid.lock);
+
t->mm_cid.active = 0;
- scoped_guard(preempt) {
- /* Clear the transition bit */
- t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
- mm_unset_cid_on_task(t);
- }
+ /* Clear the transition bit */
+ t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
+ mm_unset_cid_on_task(t);
+ hlist_del_init(&t->mm_cid.node);
t->mm->mm_cid.users--;
return mm_update_max_cids(t->mm);
}
@@ -10879,11 +11129,13 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
mutex_init(&mm->mm_cid.mutex);
mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work);
INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn);
+ INIT_HLIST_HEAD(&mm->mm_cid.user_list);
cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
bitmap_zero(mm_cidmask(mm), num_possible_cpus());
}
#else /* CONFIG_SCHED_MM_CID */
static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { }
+static inline void sched_mm_cid_fork(struct task_struct *t) { }
#endif /* !CONFIG_SCHED_MM_CID */
static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 153232dd8276..ae9fd211cec1 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -461,6 +461,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
unsigned int flags)
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
unsigned long prev_util = sg_cpu->util;
unsigned long max_cap;
@@ -482,10 +483,10 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util)
sg_cpu->util = prev_util;
- cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min,
+ cpufreq_driver_adjust_perf(sg_policy->policy, sg_cpu->bw_min,
sg_cpu->util, max_cap);
- sg_cpu->sg_policy->last_freq_update_time = time;
+ sg_policy->last_freq_update_time = time;
}
static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d08b00429323..edca7849b165 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -18,6 +18,7 @@
#include <linux/cpuset.h>
#include <linux/sched/clock.h>
+#include <linux/sched/deadline.h>
#include <uapi/linux/sched/types.h>
#include "sched.h"
#include "pelt.h"
@@ -57,17 +58,6 @@ static int __init sched_dl_sysctl_init(void)
late_initcall(sched_dl_sysctl_init);
#endif /* CONFIG_SYSCTL */
-static bool dl_server(struct sched_dl_entity *dl_se)
-{
- return dl_se->dl_server;
-}
-
-static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
-{
- BUG_ON(dl_server(dl_se));
- return container_of(dl_se, struct task_struct, dl);
-}
-
static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
{
return container_of(dl_rq, struct rq, dl);
@@ -115,6 +105,19 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
}
#endif /* !CONFIG_RT_MUTEXES */
+static inline u8 dl_get_type(struct sched_dl_entity *dl_se, struct rq *rq)
+{
+ if (!dl_server(dl_se))
+ return DL_TASK;
+ if (dl_se == &rq->fair_server)
+ return DL_SERVER_FAIR;
+#ifdef CONFIG_SCHED_CLASS_EXT
+ if (dl_se == &rq->ext_server)
+ return DL_SERVER_EXT;
+#endif
+ return DL_OTHER;
+}
+
static inline struct dl_bw *dl_bw_of(int i)
{
RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
@@ -733,6 +736,7 @@ static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
dl_se->dl_throttled = 1;
dl_se->dl_defer_armed = 1;
}
+ trace_sched_dl_replenish_tp(dl_se, cpu_of(rq), dl_get_type(dl_se, rq));
}
/*
@@ -848,6 +852,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
if (dl_se->dl_throttled)
dl_se->dl_throttled = 0;
+ trace_sched_dl_replenish_tp(dl_se, cpu_of(rq), dl_get_type(dl_se, rq));
+
/*
* If this is the replenishment of a deferred reservation,
* clear the flag and return.
@@ -975,22 +981,6 @@ update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq)
}
/*
- * Regarding the deadline, a task with implicit deadline has a relative
- * deadline == relative period. A task with constrained deadline has a
- * relative deadline <= relative period.
- *
- * We support constrained deadline tasks. However, there are some restrictions
- * applied only for tasks which do not have an implicit deadline. See
- * update_dl_entity() to know more about such restrictions.
- *
- * The dl_is_implicit() returns true if the task has an implicit deadline.
- */
-static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
-{
- return dl_se->dl_deadline == dl_se->dl_period;
-}
-
-/*
* When a deadline entity is placed in the runqueue, its runtime and deadline
* might need to be updated. This is done by a CBS wake up rule. There are two
* different rules: 1) the original CBS; and 2) the Revisited CBS.
@@ -1027,7 +1017,7 @@ static void update_dl_entity(struct sched_dl_entity *dl_se)
if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
dl_entity_overflow(dl_se, rq_clock(rq))) {
- if (unlikely(!dl_is_implicit(dl_se) &&
+ if (unlikely((!dl_is_implicit(dl_se) || dl_se->dl_defer) &&
!dl_time_before(dl_se->deadline, rq_clock(rq)) &&
!is_dl_boosted(dl_se))) {
update_dl_revised_wakeup(dl_se, rq);
@@ -1097,7 +1087,7 @@ static int start_dl_timer(struct sched_dl_entity *dl_se)
act = ns_to_ktime(dl_next_period(dl_se));
}
- now = hrtimer_cb_get_time(timer);
+ now = ktime_get();
delta = ktime_to_ns(now) - rq_clock(rq);
act = ktime_add_ns(act, delta);
@@ -1345,6 +1335,7 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se)))
return;
+ trace_sched_dl_throttle_tp(dl_se, cpu_of(rq), dl_get_type(dl_se, rq));
dl_se->dl_throttled = 1;
if (dl_se->runtime > 0)
dl_se->runtime = 0;
@@ -1508,6 +1499,7 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
throttle:
if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
+ trace_sched_dl_throttle_tp(dl_se, cpu_of(rq), dl_get_type(dl_se, rq));
dl_se->dl_throttled = 1;
/* If requested, inform the user about runtime overruns. */
@@ -1532,6 +1524,8 @@ throttle:
if (!is_leftmost(dl_se, &rq->dl))
resched_curr(rq);
+ } else {
+ trace_sched_dl_update_tp(dl_se, cpu_of(rq), dl_get_type(dl_se, rq));
}
/*
@@ -1810,6 +1804,7 @@ void dl_server_start(struct sched_dl_entity *dl_se)
if (WARN_ON_ONCE(!cpu_online(cpu_of(rq))))
return;
+ trace_sched_dl_server_start_tp(dl_se, cpu_of(rq), dl_get_type(dl_se, rq));
dl_se->dl_server_active = 1;
enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl))
@@ -1821,6 +1816,8 @@ void dl_server_stop(struct sched_dl_entity *dl_se)
if (!dl_server(dl_se) || !dl_server_active(dl_se))
return;
+ trace_sched_dl_server_stop_tp(dl_se, cpu_of(dl_se->rq),
+ dl_get_type(dl_se, dl_se->rq));
dequeue_dl_entity(dl_se, DEQUEUE_SLEEP);
hrtimer_try_to_cancel(&dl_se->dl_timer);
dl_se->dl_defer_armed = 0;
@@ -2142,10 +2139,14 @@ update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
int flags)
{
struct task_struct *p = dl_task_of(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
if (!schedstat_enabled())
return;
+ if (p != rq->curr)
+ update_stats_wait_end_dl(dl_rq, dl_se);
+
if ((flags & DEQUEUE_SLEEP)) {
unsigned int state;
@@ -2801,12 +2802,26 @@ static int find_later_rq(struct task_struct *task)
static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
{
- struct task_struct *p;
+ struct task_struct *i, *p = NULL;
+ struct rb_node *next_node;
if (!has_pushable_dl_tasks(rq))
return NULL;
- p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
+ next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root);
+ while (next_node) {
+ i = __node_2_pdl(next_node);
+ /* make sure task isn't on_cpu (possible with proxy-exec) */
+ if (!task_on_cpu(rq, i)) {
+ p = i;
+ break;
+ }
+
+ next_node = rb_next(next_node);
+ }
+
+ if (!p)
+ return NULL;
WARN_ON_ONCE(rq->cpu != task_cpu(p));
WARN_ON_ONCE(task_current(rq, p));
@@ -3613,13 +3628,26 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
}
-void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
+void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags)
{
struct sched_dl_entity *dl_se = &p->dl;
+ struct rq *rq = task_rq(p);
+ u64 adj_deadline;
attr->sched_priority = p->rt_priority;
- attr->sched_runtime = dl_se->dl_runtime;
- attr->sched_deadline = dl_se->dl_deadline;
+ if (flags & SCHED_GETATTR_FLAG_DL_DYNAMIC) {
+ guard(raw_spinlock_irq)(&rq->__lock);
+ update_rq_clock(rq);
+ if (task_current(rq, p))
+ update_curr_dl(rq);
+
+ attr->sched_runtime = dl_se->runtime;
+ adj_deadline = dl_se->deadline - rq_clock(rq) + ktime_get_ns();
+ attr->sched_deadline = adj_deadline;
+ } else {
+ attr->sched_runtime = dl_se->dl_runtime;
+ attr->sched_deadline = dl_se->dl_deadline;
+ }
attr->sched_period = dl_se->dl_period;
attr->sched_flags &= ~SCHED_DL_FLAGS;
attr->sched_flags |= dl_se->flags;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index b24f40f05019..74c1617cf652 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -8,6 +8,7 @@
*/
#include <linux/debugfs.h>
#include <linux/nmi.h>
+#include <linux/log2.h>
#include "sched.h"
/*
@@ -901,10 +902,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
- s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread;
+ s64 left_vruntime = -1, right_vruntime = -1, left_deadline = -1, spread;
+ s64 zero_vruntime = -1, sum_w_vruntime = -1;
+ u64 avruntime;
struct sched_entity *last, *first, *root;
struct rq *rq = cpu_rq(cpu);
+ unsigned int sum_shift;
unsigned long flags;
+ u64 sum_weight;
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, "\n");
@@ -925,6 +930,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
if (last)
right_vruntime = last->vruntime;
zero_vruntime = cfs_rq->zero_vruntime;
+ sum_w_vruntime = cfs_rq->sum_w_vruntime;
+ sum_weight = cfs_rq->sum_weight;
+ sum_shift = cfs_rq->sum_shift;
+ avruntime = avg_vruntime(cfs_rq);
raw_spin_rq_unlock_irqrestore(rq, flags);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline",
@@ -933,8 +942,13 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(left_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime",
SPLIT_NS(zero_vruntime));
+ SEQ_printf(m, " .%-30s: %Ld (%d bits)\n", "sum_w_vruntime",
+ sum_w_vruntime, ilog2(abs(sum_w_vruntime)));
+ SEQ_printf(m, " .%-30s: %Lu\n", "sum_weight",
+ sum_weight);
+ SEQ_printf(m, " .%-30s: %u\n", "sum_shift", sum_shift);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
- SPLIT_NS(avg_vruntime(cfs_rq)));
+ SPLIT_NS(avruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
SPLIT_NS(right_vruntime));
spread = right_vruntime - left_vruntime;
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 62b1f3ac5630..e426e27b6794 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -9,6 +9,8 @@
#include <linux/btf_ids.h>
#include "ext_idle.h"
+static DEFINE_RAW_SPINLOCK(scx_sched_lock);
+
/*
* NOTE: sched_ext is in the process of growing multiple scheduler support and
* scx_root usage is in a transitional state. Naked dereferences are safe if the
@@ -17,7 +19,23 @@
* are used as temporary markers to indicate that the dereferences need to be
* updated to point to the associated scheduler instances rather than scx_root.
*/
-static struct scx_sched __rcu *scx_root;
+struct scx_sched __rcu *scx_root;
+
+/*
+ * All scheds, writers must hold both scx_enable_mutex and scx_sched_lock.
+ * Readers can hold either or rcu_read_lock().
+ */
+static LIST_HEAD(scx_sched_all);
+
+#ifdef CONFIG_EXT_SUB_SCHED
+static const struct rhashtable_params scx_sched_hash_params = {
+ .key_len = sizeof_field(struct scx_sched, ops.sub_cgroup_id),
+ .key_offset = offsetof(struct scx_sched, ops.sub_cgroup_id),
+ .head_offset = offsetof(struct scx_sched, hash_node),
+};
+
+static struct rhashtable scx_sched_hash;
+#endif
/*
* During exit, a task may schedule after losing its PIDs. When disabling the
@@ -33,37 +51,39 @@ static DEFINE_MUTEX(scx_enable_mutex);
DEFINE_STATIC_KEY_FALSE(__scx_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
-static int scx_bypass_depth;
+static DEFINE_RAW_SPINLOCK(scx_bypass_lock);
static cpumask_var_t scx_bypass_lb_donee_cpumask;
static cpumask_var_t scx_bypass_lb_resched_cpumask;
-static bool scx_aborting;
static bool scx_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
-/*
- * Tracks whether scx_enable() called scx_bypass(true). Used to balance bypass
- * depth on enable failure. Will be removed when bypass depth is moved into the
- * sched instance.
- */
-static bool scx_bypassed_for_enable;
-
static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
+#ifdef CONFIG_EXT_SUB_SCHED
/*
- * A monotically increasing sequence number that is incremented every time a
- * scheduler is enabled. This can be used by to check if any custom sched_ext
+ * The sub sched being enabled. Used by scx_disable_and_exit_task() to exit
+ * tasks for the sub-sched being enabled. Use a global variable instead of a
+ * per-task field as all enables are serialized.
+ */
+static struct scx_sched *scx_enabling_sub_sched;
+#else
+#define scx_enabling_sub_sched (struct scx_sched *)NULL
+#endif /* CONFIG_EXT_SUB_SCHED */
+
+/*
+ * A monotonically increasing sequence number that is incremented every time a
+ * scheduler is enabled. This can be used to check if any custom sched_ext
* scheduler has ever been used in the system.
*/
static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
/*
- * The maximum amount of time in jiffies that a task may be runnable without
- * being scheduled on a CPU. If this timeout is exceeded, it will trigger
- * scx_error().
+ * Watchdog interval. All scx_sched's share a single watchdog timer and the
+ * interval is half of the shortest sch->watchdog_timeout.
*/
-static unsigned long scx_watchdog_timeout;
+static unsigned long scx_watchdog_interval;
/*
* The last time the delayed work was run. This delayed work relies on
@@ -106,25 +126,6 @@ static const struct rhashtable_params dsq_hash_params = {
static LLIST_HEAD(dsqs_to_free);
-/* dispatch buf */
-struct scx_dsp_buf_ent {
- struct task_struct *task;
- unsigned long qseq;
- u64 dsq_id;
- u64 enq_flags;
-};
-
-static u32 scx_dsp_max_batch;
-
-struct scx_dsp_ctx {
- struct rq *rq;
- u32 cursor;
- u32 nr_tasks;
- struct scx_dsp_buf_ent buf[];
-};
-
-static struct scx_dsp_ctx __percpu *scx_dsp_ctx;
-
/* string formatting from BPF */
struct scx_bstr_buf {
u64 data[MAX_BPRINTF_VARARGS];
@@ -135,6 +136,8 @@ static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock);
static struct scx_bstr_buf scx_exit_bstr_buf;
/* ops debug dump */
+static DEFINE_RAW_SPINLOCK(scx_dump_lock);
+
struct scx_dump_data {
s32 cpu;
bool first;
@@ -156,7 +159,6 @@ static struct kset *scx_kset;
* There usually is no reason to modify these as normal scheduler operation
* shouldn't be affected by them. The knobs are primarily for debugging.
*/
-static u64 scx_slice_dfl = SCX_SLICE_DFL;
static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC;
static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US;
@@ -193,10 +195,10 @@ MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microsecond
#define CREATE_TRACE_POINTS
#include <trace/events/sched_ext.h>
-static void process_ddsp_deferred_locals(struct rq *rq);
+static void run_deferred(struct rq *rq);
static bool task_dead_and_done(struct task_struct *p);
-static u32 reenq_local(struct rq *rq);
static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
+static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind);
static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
s64 exit_code, const char *fmt, va_list args);
@@ -227,28 +229,109 @@ static long jiffies_delta_msecs(unsigned long at, unsigned long now)
return -(long)jiffies_to_msecs(now - at);
}
-/* if the highest set bit is N, return a mask with bits [N+1, 31] set */
-static u32 higher_bits(u32 flags)
+static bool u32_before(u32 a, u32 b)
+{
+ return (s32)(a - b) < 0;
+}
+
+#ifdef CONFIG_EXT_SUB_SCHED
+/**
+ * scx_parent - Find the parent sched
+ * @sch: sched to find the parent of
+ *
+ * Returns the parent scheduler or %NULL if @sch is root.
+ */
+static struct scx_sched *scx_parent(struct scx_sched *sch)
{
- return ~((1 << fls(flags)) - 1);
+ if (sch->level)
+ return sch->ancestors[sch->level - 1];
+ else
+ return NULL;
}
-/* return the mask with only the highest bit set */
-static u32 highest_bit(u32 flags)
+/**
+ * scx_next_descendant_pre - find the next descendant for pre-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: sched whose descendants to walk
+ *
+ * To be used by scx_for_each_descendant_pre(). Find the next descendant to
+ * visit for pre-order traversal of @root's descendants. @root is included in
+ * the iteration and the first node to be visited.
+ */
+static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos,
+ struct scx_sched *root)
{
- int bit = fls(flags);
- return ((u64)1 << bit) >> 1;
+ struct scx_sched *next;
+
+ lockdep_assert(lockdep_is_held(&scx_enable_mutex) ||
+ lockdep_is_held(&scx_sched_lock));
+
+ /* if first iteration, visit @root */
+ if (!pos)
+ return root;
+
+ /* visit the first child if exists */
+ next = list_first_entry_or_null(&pos->children, struct scx_sched, sibling);
+ if (next)
+ return next;
+
+ /* no child, visit my or the closest ancestor's next sibling */
+ while (pos != root) {
+ if (!list_is_last(&pos->sibling, &scx_parent(pos)->children))
+ return list_next_entry(pos, sibling);
+ pos = scx_parent(pos);
+ }
+
+ return NULL;
}
-static bool u32_before(u32 a, u32 b)
+static struct scx_sched *scx_find_sub_sched(u64 cgroup_id)
{
- return (s32)(a - b) < 0;
+ return rhashtable_lookup(&scx_sched_hash, &cgroup_id,
+ scx_sched_hash_params);
}
-static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch,
- struct task_struct *p)
+static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch)
{
- return sch->global_dsqs[cpu_to_node(task_cpu(p))];
+ rcu_assign_pointer(p->scx.sched, sch);
+}
+#else /* CONFIG_EXT_SUB_SCHED */
+static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; }
+static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; }
+static struct scx_sched *scx_find_sub_sched(u64 cgroup_id) { return NULL; }
+static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {}
+#endif /* CONFIG_EXT_SUB_SCHED */
+
+/**
+ * scx_is_descendant - Test whether sched is a descendant
+ * @sch: sched to test
+ * @ancestor: ancestor sched to test against
+ *
+ * Test whether @sch is a descendant of @ancestor.
+ */
+static bool scx_is_descendant(struct scx_sched *sch, struct scx_sched *ancestor)
+{
+ if (sch->level < ancestor->level)
+ return false;
+ return sch->ancestors[ancestor->level] == ancestor;
+}
+
+/**
+ * scx_for_each_descendant_pre - pre-order walk of a sched's descendants
+ * @pos: iteration cursor
+ * @root: sched to walk the descendants of
+ *
+ * Walk @root's descendants. @root is included in the iteration and the first
+ * node to be visited. Must be called with either scx_enable_mutex or
+ * scx_sched_lock held.
+ */
+#define scx_for_each_descendant_pre(pos, root) \
+ for ((pos) = scx_next_descendant_pre(NULL, (root)); (pos); \
+ (pos) = scx_next_descendant_pre((pos), (root)))
+
+static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, s32 cpu)
+{
+ return &sch->pnode[cpu_to_node(cpu)]->global_dsq;
}
static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
@@ -264,28 +347,106 @@ static const struct sched_class *scx_setscheduler_class(struct task_struct *p)
return __setscheduler_class(p->policy, p->prio);
}
-/*
- * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
- * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
- * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check
- * whether it's running from an allowed context.
+static struct scx_dispatch_q *bypass_dsq(struct scx_sched *sch, s32 cpu)
+{
+ return &per_cpu_ptr(sch->pcpu, cpu)->bypass_dsq;
+}
+
+static struct scx_dispatch_q *bypass_enq_target_dsq(struct scx_sched *sch, s32 cpu)
+{
+#ifdef CONFIG_EXT_SUB_SCHED
+ /*
+ * If @sch is a sub-sched which is bypassing, its tasks should go into
+ * the bypass DSQs of the nearest ancestor which is not bypassing. The
+ * not-bypassing ancestor is responsible for scheduling all tasks from
+ * bypassing sub-trees. If all ancestors including root are bypassing,
+ * all tasks should go to the root's bypass DSQs.
+ *
+ * Whenever a sched starts bypassing, all runnable tasks in its subtree
+ * are re-enqueued after scx_bypassing() is turned on, guaranteeing that
+ * all tasks are transferred to the right DSQs.
+ */
+ while (scx_parent(sch) && scx_bypassing(sch, cpu))
+ sch = scx_parent(sch);
+#endif /* CONFIG_EXT_SUB_SCHED */
+
+ return bypass_dsq(sch, cpu);
+}
+
+/**
+ * bypass_dsp_enabled - Check if bypass dispatch path is enabled
+ * @sch: scheduler to check
+ *
+ * When a descendant scheduler enters bypass mode, bypassed tasks are scheduled
+ * by the nearest non-bypassing ancestor, or the root scheduler if all ancestors
+ * are bypassing. In the former case, the ancestor is not itself bypassing but
+ * its bypass DSQs will be populated with bypassed tasks from descendants. Thus,
+ * the ancestor's bypass dispatch path must be active even though its own
+ * bypass_depth remains zero.
*
- * @mask is constant, always inline to cull the mask calculations.
+ * This function checks bypass_dsp_enable_depth which is managed separately from
+ * bypass_depth to enable this decoupling. See enable_bypass_dsp() and
+ * disable_bypass_dsp().
*/
-static __always_inline void scx_kf_allow(u32 mask)
+static bool bypass_dsp_enabled(struct scx_sched *sch)
{
- /* nesting is allowed only in increasing scx_kf_mask order */
- WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask,
- "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n",
- current->scx.kf_mask, mask);
- current->scx.kf_mask |= mask;
- barrier();
+ return unlikely(atomic_read(&sch->bypass_dsp_enable_depth));
}
-static void scx_kf_disallow(u32 mask)
+/**
+ * rq_is_open - Is the rq available for immediate execution of an SCX task?
+ * @rq: rq to test
+ * @enq_flags: optional %SCX_ENQ_* of the task being enqueued
+ *
+ * Returns %true if @rq is currently open for executing an SCX task. After a
+ * %false return, @rq is guaranteed to invoke SCX dispatch path at least once
+ * before going to idle and not inserting a task into @rq's local DSQ after a
+ * %false return doesn't cause @rq to stall.
+ */
+static bool rq_is_open(struct rq *rq, u64 enq_flags)
{
- barrier();
- current->scx.kf_mask &= ~mask;
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * A higher-priority class task is either running or in the process of
+ * waking up on @rq.
+ */
+ if (sched_class_above(rq->next_class, &ext_sched_class))
+ return false;
+
+ /*
+ * @rq is either in transition to or in idle and there is no
+ * higher-priority class task waking up on it.
+ */
+ if (sched_class_above(&ext_sched_class, rq->next_class))
+ return true;
+
+ /*
+ * @rq is either picking, in transition to, or running an SCX task.
+ */
+
+ /*
+ * If we're in the dispatch path holding rq lock, $curr may or may not
+ * be ready depending on whether the on-going dispatch decides to extend
+ * $curr's slice. We say yes here and resolve it at the end of dispatch.
+ * See balance_one().
+ */
+ if (rq->scx.flags & SCX_RQ_IN_BALANCE)
+ return true;
+
+ /*
+ * %SCX_ENQ_PREEMPT clears $curr's slice if on SCX and kicks dispatch,
+ * so allow it to avoid spuriously triggering reenq on a combined
+ * PREEMPT|IMMED insertion.
+ */
+ if (enq_flags & SCX_ENQ_PREEMPT)
+ return true;
+
+ /*
+ * @rq is either in transition to or running an SCX task and can't go
+ * idle without another SCX dispatch cycle.
+ */
+ return false;
}
/*
@@ -308,119 +469,77 @@ static inline void update_locked_rq(struct rq *rq)
__this_cpu_write(scx_locked_rq_state, rq);
}
-#define SCX_CALL_OP(sch, mask, op, rq, args...) \
+#define SCX_CALL_OP(sch, op, rq, args...) \
do { \
if (rq) \
update_locked_rq(rq); \
- if (mask) { \
- scx_kf_allow(mask); \
- (sch)->ops.op(args); \
- scx_kf_disallow(mask); \
- } else { \
- (sch)->ops.op(args); \
- } \
+ (sch)->ops.op(args); \
if (rq) \
update_locked_rq(NULL); \
} while (0)
-#define SCX_CALL_OP_RET(sch, mask, op, rq, args...) \
+#define SCX_CALL_OP_RET(sch, op, rq, args...) \
({ \
__typeof__((sch)->ops.op(args)) __ret; \
\
if (rq) \
update_locked_rq(rq); \
- if (mask) { \
- scx_kf_allow(mask); \
- __ret = (sch)->ops.op(args); \
- scx_kf_disallow(mask); \
- } else { \
- __ret = (sch)->ops.op(args); \
- } \
+ __ret = (sch)->ops.op(args); \
if (rq) \
update_locked_rq(NULL); \
__ret; \
})
/*
- * Some kfuncs are allowed only on the tasks that are subjects of the
- * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such
- * restrictions, the following SCX_CALL_OP_*() variants should be used when
- * invoking scx_ops operations that take task arguments. These can only be used
- * for non-nesting operations due to the way the tasks are tracked.
- *
- * kfuncs which can only operate on such tasks can in turn use
- * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on
- * the specific task.
+ * SCX_CALL_OP_TASK*() invokes an SCX op that takes one or two task arguments
+ * and records them in current->scx.kf_tasks[] for the duration of the call. A
+ * kfunc invoked from inside such an op can then use
+ * scx_kf_arg_task_ok() to verify that its task argument is one of
+ * those subject tasks.
+ *
+ * Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held -
+ * either via the @rq argument here, or (for ops.select_cpu()) via @p's pi_lock
+ * held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. So if
+ * kf_tasks[] is set, @p's scheduler-protected fields are stable.
+ *
+ * kf_tasks[] can not stack, so task-based SCX ops must not nest. The
+ * WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants
+ * while a previous one is still in progress.
*/
-#define SCX_CALL_OP_TASK(sch, mask, op, rq, task, args...) \
+#define SCX_CALL_OP_TASK(sch, op, rq, task, args...) \
do { \
- BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
+ WARN_ON_ONCE(current->scx.kf_tasks[0]); \
current->scx.kf_tasks[0] = task; \
- SCX_CALL_OP((sch), mask, op, rq, task, ##args); \
+ SCX_CALL_OP((sch), op, rq, task, ##args); \
current->scx.kf_tasks[0] = NULL; \
} while (0)
-#define SCX_CALL_OP_TASK_RET(sch, mask, op, rq, task, args...) \
+#define SCX_CALL_OP_TASK_RET(sch, op, rq, task, args...) \
({ \
__typeof__((sch)->ops.op(task, ##args)) __ret; \
- BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
+ WARN_ON_ONCE(current->scx.kf_tasks[0]); \
current->scx.kf_tasks[0] = task; \
- __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task, ##args); \
+ __ret = SCX_CALL_OP_RET((sch), op, rq, task, ##args); \
current->scx.kf_tasks[0] = NULL; \
__ret; \
})
-#define SCX_CALL_OP_2TASKS_RET(sch, mask, op, rq, task0, task1, args...) \
+#define SCX_CALL_OP_2TASKS_RET(sch, op, rq, task0, task1, args...) \
({ \
__typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \
- BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
+ WARN_ON_ONCE(current->scx.kf_tasks[0]); \
current->scx.kf_tasks[0] = task0; \
current->scx.kf_tasks[1] = task1; \
- __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task0, task1, ##args); \
+ __ret = SCX_CALL_OP_RET((sch), op, rq, task0, task1, ##args); \
current->scx.kf_tasks[0] = NULL; \
current->scx.kf_tasks[1] = NULL; \
__ret; \
})
-/* @mask is constant, always inline to cull unnecessary branches */
-static __always_inline bool scx_kf_allowed(struct scx_sched *sch, u32 mask)
-{
- if (unlikely(!(current->scx.kf_mask & mask))) {
- scx_error(sch, "kfunc with mask 0x%x called from an operation only allowing 0x%x",
- mask, current->scx.kf_mask);
- return false;
- }
-
- /*
- * Enforce nesting boundaries. e.g. A kfunc which can be called from
- * DISPATCH must not be called if we're running DEQUEUE which is nested
- * inside ops.dispatch(). We don't need to check boundaries for any
- * blocking kfuncs as the verifier ensures they're only called from
- * sleepable progs.
- */
- if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
- (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
- scx_error(sch, "cpu_release kfunc called from a nested operation");
- return false;
- }
-
- if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
- (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
- scx_error(sch, "dispatch kfunc called from a nested operation");
- return false;
- }
-
- return true;
-}
-
/* see SCX_CALL_OP_TASK() */
-static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch,
- u32 mask,
+static __always_inline bool scx_kf_arg_task_ok(struct scx_sched *sch,
struct task_struct *p)
{
- if (!scx_kf_allowed(sch, mask))
- return false;
-
if (unlikely((p != current->scx.kf_tasks[0] &&
p != current->scx.kf_tasks[1]))) {
scx_error(sch, "called on a task not being operated on");
@@ -430,9 +549,22 @@ static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch,
return true;
}
+enum scx_dsq_iter_flags {
+ /* iterate in the reverse dispatch order */
+ SCX_DSQ_ITER_REV = 1U << 16,
+
+ __SCX_DSQ_ITER_HAS_SLICE = 1U << 30,
+ __SCX_DSQ_ITER_HAS_VTIME = 1U << 31,
+
+ __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV,
+ __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS |
+ __SCX_DSQ_ITER_HAS_SLICE |
+ __SCX_DSQ_ITER_HAS_VTIME,
+};
+
/**
* nldsq_next_task - Iterate to the next task in a non-local DSQ
- * @dsq: user dsq being iterated
+ * @dsq: non-local dsq being iterated
* @cur: current position, %NULL to start iteration
* @rev: walk backwards
*
@@ -472,6 +604,85 @@ static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq,
for ((p) = nldsq_next_task((dsq), NULL, false); (p); \
(p) = nldsq_next_task((dsq), (p), false))
+/**
+ * nldsq_cursor_next_task - Iterate to the next task given a cursor in a non-local DSQ
+ * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR()
+ * @dsq: non-local dsq being iterated
+ *
+ * Find the next task in a cursor based iteration. The caller must have
+ * initialized @cursor using INIT_DSQ_LIST_CURSOR() and can release the DSQ lock
+ * between the iteration steps.
+ *
+ * Only tasks which were queued before @cursor was initialized are visible. This
+ * bounds the iteration and guarantees that vtime never jumps in the other
+ * direction while iterating.
+ */
+static struct task_struct *nldsq_cursor_next_task(struct scx_dsq_list_node *cursor,
+ struct scx_dispatch_q *dsq)
+{
+ bool rev = cursor->flags & SCX_DSQ_ITER_REV;
+ struct task_struct *p;
+
+ lockdep_assert_held(&dsq->lock);
+ BUG_ON(!(cursor->flags & SCX_DSQ_LNODE_ITER_CURSOR));
+
+ if (list_empty(&cursor->node))
+ p = NULL;
+ else
+ p = container_of(cursor, struct task_struct, scx.dsq_list);
+
+ /* skip cursors and tasks that were queued after @cursor init */
+ do {
+ p = nldsq_next_task(dsq, p, rev);
+ } while (p && unlikely(u32_before(cursor->priv, p->scx.dsq_seq)));
+
+ if (p) {
+ if (rev)
+ list_move_tail(&cursor->node, &p->scx.dsq_list.node);
+ else
+ list_move(&cursor->node, &p->scx.dsq_list.node);
+ } else {
+ list_del_init(&cursor->node);
+ }
+
+ return p;
+}
+
+/**
+ * nldsq_cursor_lost_task - Test whether someone else took the task since iteration
+ * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR()
+ * @rq: rq @p was on
+ * @dsq: dsq @p was on
+ * @p: target task
+ *
+ * @p is a task returned by nldsq_cursor_next_task(). The locks may have been
+ * dropped and re-acquired in between. Verify that no one else took or is in the
+ * process of taking @p from @dsq.
+ *
+ * On %false return, the caller can assume full ownership of @p.
+ */
+static bool nldsq_cursor_lost_task(struct scx_dsq_list_node *cursor,
+ struct rq *rq, struct scx_dispatch_q *dsq,
+ struct task_struct *p)
+{
+ lockdep_assert_rq_held(rq);
+ lockdep_assert_held(&dsq->lock);
+
+	/*
+	 * @p could have already left $src_dsq, been re-enqueued, or be in the
+	 * process of being consumed by someone else.
+ */
+ if (unlikely(p->scx.dsq != dsq ||
+ u32_before(cursor->priv, p->scx.dsq_seq) ||
+ p->scx.holding_cpu >= 0))
+ return true;
+
+ /* if @p has stayed on @dsq, its rq couldn't have changed */
+ if (WARN_ON_ONCE(rq != task_rq(p)))
+ return true;
+
+ return false;
+}
/*
* BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse]
@@ -479,19 +690,6 @@ static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq,
* changes without breaking backward compatibility. Can be used with
* bpf_for_each(). See bpf_iter_scx_dsq_*().
*/
-enum scx_dsq_iter_flags {
- /* iterate in the reverse dispatch order */
- SCX_DSQ_ITER_REV = 1U << 16,
-
- __SCX_DSQ_ITER_HAS_SLICE = 1U << 30,
- __SCX_DSQ_ITER_HAS_VTIME = 1U << 31,
-
- __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV,
- __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS |
- __SCX_DSQ_ITER_HAS_SLICE |
- __SCX_DSQ_ITER_HAS_VTIME,
-};
-
struct bpf_iter_scx_dsq_kern {
struct scx_dsq_list_node cursor;
struct scx_dispatch_q *dsq;
@@ -514,14 +712,31 @@ struct scx_task_iter {
struct rq_flags rf;
u32 cnt;
bool list_locked;
+#ifdef CONFIG_EXT_SUB_SCHED
+ struct cgroup *cgrp;
+ struct cgroup_subsys_state *css_pos;
+ struct css_task_iter css_iter;
+#endif
};
/**
* scx_task_iter_start - Lock scx_tasks_lock and start a task iteration
* @iter: iterator to init
+ * @cgrp: Optional root of cgroup subhierarchy to iterate
+ *
+ * Initialize @iter. Once initialized, @iter must eventually be stopped with
+ * scx_task_iter_stop().
+ *
+ * If @cgrp is %NULL, scx_tasks is used for iteration and this function returns
+ * with scx_tasks_lock held and @iter->cursor inserted into scx_tasks.
*
- * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter
- * must eventually be stopped with scx_task_iter_stop().
+ * If @cgrp is not %NULL, @cgrp and its descendants' tasks are walked using
+ * @iter->css_iter. The caller must be holding cgroup_lock() to prevent cgroup
+ * task migrations.
+ *
+ * The two modes of iterations are largely independent and it's likely that
+ * scx_tasks can be removed in favor of always using cgroup iteration if
+ * CONFIG_SCHED_CLASS_EXT depends on CONFIG_CGROUPS.
*
* scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock()
* between this and the first next() call or between any two next() calls. If
@@ -532,10 +747,19 @@ struct scx_task_iter {
* All tasks which existed when the iteration started are guaranteed to be
* visited as long as they are not dead.
*/
-static void scx_task_iter_start(struct scx_task_iter *iter)
+static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp)
{
memset(iter, 0, sizeof(*iter));
+#ifdef CONFIG_EXT_SUB_SCHED
+ if (cgrp) {
+ lockdep_assert_held(&cgroup_mutex);
+ iter->cgrp = cgrp;
+ iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self);
+ css_task_iter_start(iter->css_pos, 0, &iter->css_iter);
+ return;
+ }
+#endif
raw_spin_lock_irq(&scx_tasks_lock);
iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
@@ -588,6 +812,14 @@ static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter)
*/
static void scx_task_iter_stop(struct scx_task_iter *iter)
{
+#ifdef CONFIG_EXT_SUB_SCHED
+ if (iter->cgrp) {
+ if (iter->css_pos)
+ css_task_iter_end(&iter->css_iter);
+ __scx_task_iter_rq_unlock(iter);
+ return;
+ }
+#endif
__scx_task_iter_maybe_relock(iter);
list_del_init(&iter->cursor.tasks_node);
scx_task_iter_unlock(iter);
@@ -611,6 +843,24 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
cond_resched();
}
+#ifdef CONFIG_EXT_SUB_SCHED
+ if (iter->cgrp) {
+ while (iter->css_pos) {
+ struct task_struct *p;
+
+ p = css_task_iter_next(&iter->css_iter);
+ if (p)
+ return p;
+
+ css_task_iter_end(&iter->css_iter);
+ iter->css_pos = css_next_descendant_pre(iter->css_pos,
+ &iter->cgrp->self);
+ if (iter->css_pos)
+ css_task_iter_start(iter->css_pos, 0, &iter->css_iter);
+ }
+ return NULL;
+ }
+#endif
__scx_task_iter_maybe_relock(iter);
list_for_each_entry(pos, cursor, tasks_node) {
@@ -810,16 +1060,6 @@ static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err
return -EPROTO;
}
-static void run_deferred(struct rq *rq)
-{
- process_ddsp_deferred_locals(rq);
-
- if (local_read(&rq->scx.reenq_local_deferred)) {
- local_set(&rq->scx.reenq_local_deferred, 0);
- reenq_local(rq);
- }
-}
-
static void deferred_bal_cb_workfn(struct rq *rq)
{
run_deferred(rq);
@@ -845,10 +1085,18 @@ static void deferred_irq_workfn(struct irq_work *irq_work)
static void schedule_deferred(struct rq *rq)
{
/*
- * Queue an irq work. They are executed on IRQ re-enable which may take
- * a bit longer than the scheduler hook in schedule_deferred_locked().
+ * This is the fallback when schedule_deferred_locked() can't use
+ * the cheaper balance callback or wakeup hook paths (the target
+ * CPU is not in balance or wakeup). Currently, this is primarily
+ * hit by reenqueue operations targeting a remote CPU.
+ *
+ * Queue on the target CPU. The deferred work can run from any CPU
+ * correctly - the _locked() path already processes remote rqs from
+ * the calling CPU - but targeting the owning CPU allows IPI delivery
+ * without waiting for the calling CPU to re-enable IRQs and is
+ * cheaper as the reenqueue runs locally.
*/
- irq_work_queue(&rq->scx.deferred_irq_work);
+ irq_work_queue_on(&rq->scx.deferred_irq_work, cpu_of(rq));
}
/**
@@ -898,6 +1146,81 @@ static void schedule_deferred_locked(struct rq *rq)
schedule_deferred(rq);
}
+static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq,
+ u64 reenq_flags, struct rq *locked_rq)
+{
+ struct rq *rq;
+
+	/*
+	 * Allowing reenqueues doesn't make sense while bypassing. This also
+	 * prevents new reenqueues from being scheduled on dead scheds.
+ */
+ if (unlikely(READ_ONCE(sch->bypass_depth)))
+ return;
+
+ if (dsq->id == SCX_DSQ_LOCAL) {
+ rq = container_of(dsq, struct rq, scx.local_dsq);
+
+ struct scx_sched_pcpu *sch_pcpu = per_cpu_ptr(sch->pcpu, cpu_of(rq));
+ struct scx_deferred_reenq_local *drl = &sch_pcpu->deferred_reenq_local;
+
+ /*
+ * Pairs with smp_mb() in process_deferred_reenq_locals() and
+ * guarantees that there is a reenq_local() afterwards.
+ */
+ smp_mb();
+
+ if (list_empty(&drl->node) ||
+ (READ_ONCE(drl->flags) & reenq_flags) != reenq_flags) {
+
+ guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);
+
+ if (list_empty(&drl->node))
+ list_move_tail(&drl->node, &rq->scx.deferred_reenq_locals);
+ WRITE_ONCE(drl->flags, drl->flags | reenq_flags);
+ }
+ } else if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) {
+ rq = this_rq();
+
+ struct scx_dsq_pcpu *dsq_pcpu = per_cpu_ptr(dsq->pcpu, cpu_of(rq));
+ struct scx_deferred_reenq_user *dru = &dsq_pcpu->deferred_reenq_user;
+
+ /*
+ * Pairs with smp_mb() in process_deferred_reenq_users() and
+ * guarantees that there is a reenq_user() afterwards.
+ */
+ smp_mb();
+
+ if (list_empty(&dru->node) ||
+ (READ_ONCE(dru->flags) & reenq_flags) != reenq_flags) {
+
+ guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);
+
+ if (list_empty(&dru->node))
+ list_move_tail(&dru->node, &rq->scx.deferred_reenq_users);
+ WRITE_ONCE(dru->flags, dru->flags | reenq_flags);
+ }
+ } else {
+ scx_error(sch, "DSQ 0x%llx not allowed for reenq", dsq->id);
+ return;
+ }
+
+ if (rq == locked_rq)
+ schedule_deferred_locked(rq);
+ else
+ schedule_deferred(rq);
+}
+
+static void schedule_reenq_local(struct rq *rq, u64 reenq_flags)
+{
+ struct scx_sched *root = rcu_dereference_sched(scx_root);
+
+ if (WARN_ON_ONCE(!root))
+ return;
+
+ schedule_dsq_reenq(root, &rq->scx.local_dsq, reenq_flags, rq);
+}
+
/**
* touch_core_sched - Update timestamp used for core-sched task ordering
* @rq: rq to read clock from, must be locked
@@ -974,24 +1297,105 @@ static bool scx_dsq_priq_less(struct rb_node *node_a,
return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime);
}
-static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
+static void dsq_inc_nr(struct scx_dispatch_q *dsq, struct task_struct *p, u64 enq_flags)
{
/* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */
- WRITE_ONCE(dsq->nr, dsq->nr + delta);
+ WRITE_ONCE(dsq->nr, dsq->nr + 1);
+
+ /*
+ * Once @p reaches a local DSQ, it can only leave it by being dispatched
+ * to the CPU or dequeued. In both cases, the only way @p can go back to
+ * the BPF sched is through enqueueing. If being inserted into a local
+ * DSQ with IMMED, persist the state until the next enqueueing event in
+ * do_enqueue_task() so that we can maintain IMMED protection through
+ * e.g. SAVE/RESTORE cycles and slice extensions.
+ */
+ if (enq_flags & SCX_ENQ_IMMED) {
+ if (unlikely(dsq->id != SCX_DSQ_LOCAL)) {
+ WARN_ON_ONCE(!(enq_flags & SCX_ENQ_GDSQ_FALLBACK));
+ return;
+ }
+ p->scx.flags |= SCX_TASK_IMMED;
+ }
+
+ if (p->scx.flags & SCX_TASK_IMMED) {
+ struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
+
+ if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
+ return;
+
+ rq->scx.nr_immed++;
+
+ /*
+ * If @rq already had other tasks or the current task is not
+ * done yet, @p can't go on the CPU immediately. Re-enqueue.
+ */
+ if (unlikely(dsq->nr > 1 || !rq_is_open(rq, enq_flags)))
+ schedule_reenq_local(rq, 0);
+ }
+}
+
+static void dsq_dec_nr(struct scx_dispatch_q *dsq, struct task_struct *p)
+{
+ /* see dsq_inc_nr() */
+ WRITE_ONCE(dsq->nr, dsq->nr - 1);
+
+ if (p->scx.flags & SCX_TASK_IMMED) {
+ struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
+
+ if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL) ||
+ WARN_ON_ONCE(rq->scx.nr_immed <= 0))
+ return;
+
+ rq->scx.nr_immed--;
+ }
}
static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
{
- p->scx.slice = READ_ONCE(scx_slice_dfl);
+ p->scx.slice = READ_ONCE(sch->slice_dfl);
__scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
}
+/*
+ * Return true if @p is moving due to an internal SCX migration, false
+ * otherwise.
+ */
+static inline bool task_scx_migrating(struct task_struct *p)
+{
+ /*
+ * We only need to check sticky_cpu: it is set to the destination
+ * CPU in move_remote_task_to_local_dsq() before deactivate_task()
+ * and cleared when the task is enqueued on the destination, so it
+ * is only non-negative during an internal SCX migration.
+ */
+ return p->scx.sticky_cpu >= 0;
+}
+
+/*
+ * Call ops.dequeue() if the task is in BPF custody and not migrating.
+ * Clears %SCX_TASK_IN_CUSTODY when the callback is invoked.
+ */
+static void call_task_dequeue(struct scx_sched *sch, struct rq *rq,
+ struct task_struct *p, u64 deq_flags)
+{
+ if (!(p->scx.flags & SCX_TASK_IN_CUSTODY) || task_scx_migrating(p))
+ return;
+
+ if (SCX_HAS_OP(sch, dequeue))
+ SCX_CALL_OP_TASK(sch, dequeue, rq, p, deq_flags);
+
+ p->scx.flags &= ~SCX_TASK_IN_CUSTODY;
+}
+
static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p,
u64 enq_flags)
{
struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
bool preempt = false;
+ call_task_dequeue(scx_root, rq, p, 0);
+
/*
* If @rq is in balance, the CPU is already vacant and looking for the
* next task to run. No need to preempt or trigger resched after moving
@@ -1010,8 +1414,9 @@ static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p
resched_curr(rq);
}
-static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
- struct task_struct *p, u64 enq_flags)
+static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq,
+ struct scx_dispatch_q *dsq, struct task_struct *p,
+ u64 enq_flags)
{
bool is_local = dsq->id == SCX_DSQ_LOCAL;
@@ -1027,7 +1432,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
scx_error(sch, "attempting to dispatch to a destroyed dsq");
/* fall back to the global dsq */
raw_spin_unlock(&dsq->lock);
- dsq = find_global_dsq(sch, p);
+ dsq = find_global_dsq(sch, task_cpu(p));
raw_spin_lock(&dsq->lock);
}
}
@@ -1099,20 +1504,33 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
}
/* seq records the order tasks are queued, used by BPF DSQ iterator */
- dsq->seq++;
+ WRITE_ONCE(dsq->seq, dsq->seq + 1);
p->scx.dsq_seq = dsq->seq;
- dsq_mod_nr(dsq, 1);
+ dsq_inc_nr(dsq, p, enq_flags);
p->scx.dsq = dsq;
/*
- * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the
- * direct dispatch path, but we clear them here because the direct
- * dispatch verdict may be overridden on the enqueue path during e.g.
- * bypass.
+ * Update custody and call ops.dequeue() before clearing ops_state:
+ * once ops_state is cleared, waiters in ops_dequeue() can proceed
+ * and dequeue_task_scx() will RMW p->scx.flags. If we clear
+ * ops_state first, both sides would modify p->scx.flags
+ * concurrently in a non-atomic way.
*/
- p->scx.ddsp_dsq_id = SCX_DSQ_INVALID;
- p->scx.ddsp_enq_flags = 0;
+ if (is_local) {
+ local_dsq_post_enq(dsq, p, enq_flags);
+ } else {
+ /*
+ * Task on global/bypass DSQ: leave custody, task on
+ * non-terminal DSQ: enter custody.
+ */
+ if (dsq->id == SCX_DSQ_GLOBAL || dsq->id == SCX_DSQ_BYPASS)
+ call_task_dequeue(sch, rq, p, 0);
+ else
+ p->scx.flags |= SCX_TASK_IN_CUSTODY;
+
+ raw_spin_unlock(&dsq->lock);
+ }
/*
* We're transitioning out of QUEUEING or DISPATCHING. store_release to
@@ -1120,11 +1538,6 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
*/
if (enq_flags & SCX_ENQ_CLEAR_OPSS)
atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
-
- if (is_local)
- local_dsq_post_enq(dsq, p, enq_flags);
- else
- raw_spin_unlock(&dsq->lock);
}
static void task_unlink_from_dsq(struct task_struct *p,
@@ -1139,7 +1552,7 @@ static void task_unlink_from_dsq(struct task_struct *p,
}
list_del_init(&p->scx.dsq_list.node);
- dsq_mod_nr(dsq, -1);
+ dsq_dec_nr(dsq, p);
if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) {
struct task_struct *first_task;
@@ -1218,7 +1631,7 @@ static void dispatch_dequeue_locked(struct task_struct *p,
static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
struct rq *rq, u64 dsq_id,
- struct task_struct *p)
+ s32 tcpu)
{
struct scx_dispatch_q *dsq;
@@ -1229,20 +1642,19 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
- return find_global_dsq(sch, p);
+ return find_global_dsq(sch, tcpu);
return &cpu_rq(cpu)->scx.local_dsq;
}
if (dsq_id == SCX_DSQ_GLOBAL)
- dsq = find_global_dsq(sch, p);
+ dsq = find_global_dsq(sch, tcpu);
else
dsq = find_user_dsq(sch, dsq_id);
if (unlikely(!dsq)) {
- scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]",
- dsq_id, p->comm, p->pid);
- return find_global_dsq(sch, p);
+ scx_error(sch, "non-existent DSQ 0x%llx", dsq_id);
+ return find_global_dsq(sch, tcpu);
}
return dsq;
@@ -1279,12 +1691,34 @@ static void mark_direct_dispatch(struct scx_sched *sch,
p->scx.ddsp_enq_flags = enq_flags;
}
+/*
+ * Clear @p direct dispatch state when leaving the scheduler.
+ *
+ * Direct dispatch state must be cleared in the following cases:
+ * - direct_dispatch(): cleared on the synchronous enqueue path, deferred
+ * dispatch keeps the state until consumed
+ * - process_ddsp_deferred_locals(): cleared after consuming deferred state,
+ * - do_enqueue_task(): cleared on enqueue fallbacks where the dispatch
+ * verdict is ignored (local/global/bypass)
+ * - dequeue_task_scx(): cleared after dispatch_dequeue(), covering deferred
+ * cancellation and holding_cpu races
+ * - scx_disable_task(): cleared for queued wakeup tasks, which are excluded by
+ * the scx_bypass() loop, so that stale state is not reused by a subsequent
+ * scheduler instance
+ */
+static inline void clear_direct_dispatch(struct task_struct *p)
+{
+ p->scx.ddsp_dsq_id = SCX_DSQ_INVALID;
+ p->scx.ddsp_enq_flags = 0;
+}
+
static void direct_dispatch(struct scx_sched *sch, struct task_struct *p,
u64 enq_flags)
{
struct rq *rq = task_rq(p);
struct scx_dispatch_q *dsq =
- find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
+ find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p));
+ u64 ddsp_enq_flags;
touch_core_sched_dispatch(rq, p);
@@ -1325,8 +1759,10 @@ static void direct_dispatch(struct scx_sched *sch, struct task_struct *p,
return;
}
- dispatch_enqueue(sch, dsq, p,
- p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
+ ddsp_enq_flags = p->scx.ddsp_enq_flags;
+ clear_direct_dispatch(p);
+
+ dispatch_enqueue(sch, rq, dsq, p, ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
}
static bool scx_rq_online(struct rq *rq)
@@ -1344,18 +1780,26 @@ static bool scx_rq_online(struct rq *rq)
static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
int sticky_cpu)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch = scx_task_sched(p);
struct task_struct **ddsp_taskp;
struct scx_dispatch_q *dsq;
unsigned long qseq;
WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));
- /* rq migration */
+ /* internal movements - rq migration / RESTORE */
if (sticky_cpu == cpu_of(rq))
goto local_norefill;
/*
+ * Clear persistent TASK_IMMED for fresh enqueues, see dsq_inc_nr().
+ * Note that exiting and migration-disabled tasks that skip
+ * ops.enqueue() below will lose IMMED protection unless
+ * %SCX_OPS_ENQ_EXITING / %SCX_OPS_ENQ_MIGRATION_DISABLED are set.
+ */
+ p->scx.flags &= ~SCX_TASK_IMMED;
+
+ /*
* If !scx_rq_online(), we already told the BPF scheduler that the CPU
* is offline and are just running the hotplug path. Don't bother the
* BPF scheduler.
@@ -1363,7 +1807,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
if (!scx_rq_online(rq))
goto local;
- if (scx_rq_bypassing(rq)) {
+ if (scx_bypassing(sch, cpu_of(rq))) {
__scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
goto bypass;
}
@@ -1398,13 +1842,19 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
WARN_ON_ONCE(*ddsp_taskp);
*ddsp_taskp = p;
- SCX_CALL_OP_TASK(sch, SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags);
+ SCX_CALL_OP_TASK(sch, enqueue, rq, p, enq_flags);
*ddsp_taskp = NULL;
if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
goto direct;
/*
+ * Task is now in BPF scheduler's custody. Set %SCX_TASK_IN_CUSTODY
+ * so ops.dequeue() is called when it leaves custody.
+ */
+ p->scx.flags |= SCX_TASK_IN_CUSTODY;
+
+ /*
* If not directly dispatched, QUEUEING isn't clear yet and dispatch or
* dequeue may be waiting. The store_release matches their load_acquire.
*/
@@ -1415,16 +1865,16 @@ direct:
direct_dispatch(sch, p, enq_flags);
return;
local_norefill:
- dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags);
+ dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, enq_flags);
return;
local:
dsq = &rq->scx.local_dsq;
goto enqueue;
global:
- dsq = find_global_dsq(sch, p);
+ dsq = find_global_dsq(sch, task_cpu(p));
goto enqueue;
bypass:
- dsq = &task_rq(p)->scx.bypass_dsq;
+ dsq = bypass_enq_target_dsq(sch, task_cpu(p));
goto enqueue;
enqueue:
@@ -1435,7 +1885,8 @@ enqueue:
*/
touch_core_sched(rq, p);
refill_task_slice_dfl(sch, p);
- dispatch_enqueue(sch, dsq, p, enq_flags);
+ clear_direct_dispatch(p);
+ dispatch_enqueue(sch, rq, dsq, p, enq_flags);
}
static bool task_runnable(const struct task_struct *p)
@@ -1466,19 +1917,15 @@ static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
}
-static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
+static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int core_enq_flags)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch = scx_task_sched(p);
int sticky_cpu = p->scx.sticky_cpu;
+ u64 enq_flags = core_enq_flags | rq->scx.extra_enq_flags;
if (enq_flags & ENQUEUE_WAKEUP)
rq->scx.flags |= SCX_RQ_IN_WAKEUP;
- enq_flags |= rq->scx.extra_enq_flags;
-
- if (sticky_cpu >= 0)
- p->scx.sticky_cpu = -1;
-
/*
* Restoring a running task will be immediately followed by
* set_next_task_scx() which expects the task to not be on the BPF
@@ -1499,7 +1946,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
add_nr_running(rq, 1);
if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p))
- SCX_CALL_OP_TASK(sch, SCX_KF_REST, runnable, rq, p, enq_flags);
+ SCX_CALL_OP_TASK(sch, runnable, rq, p, enq_flags);
if (enq_flags & SCX_ENQ_WAKEUP)
touch_core_sched(rq, p);
@@ -1509,6 +1956,9 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
dl_server_start(&rq->ext_server);
do_enqueue_task(rq, p, enq_flags, sticky_cpu);
+
+ if (sticky_cpu >= 0)
+ p->scx.sticky_cpu = -1;
out:
rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
@@ -1519,7 +1969,7 @@ out:
static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch = scx_task_sched(p);
unsigned long opss;
/* dequeue is always temporary, don't reset runnable_at */
@@ -1538,10 +1988,8 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
*/
BUG();
case SCX_OPSS_QUEUED:
- if (SCX_HAS_OP(sch, dequeue))
- SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
- p, deq_flags);
-
+ /* A queued task must always be in BPF scheduler's custody */
+ WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_IN_CUSTODY));
if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
SCX_OPSS_NONE))
break;
@@ -1564,11 +2012,35 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
break;
}
+
+ /*
+ * Call ops.dequeue() if the task is still in BPF custody.
+ *
+ * The code that clears ops_state to %SCX_OPSS_NONE does not always
+ * clear %SCX_TASK_IN_CUSTODY: in dispatch_to_local_dsq(), when
+ * we're moving a task that was in %SCX_OPSS_DISPATCHING to a
+ * remote CPU's local DSQ, we only set ops_state to %SCX_OPSS_NONE
+ * so that a concurrent dequeue can proceed, but we clear
+ * %SCX_TASK_IN_CUSTODY only when we later enqueue or move the
+ * task. So we can see NONE + IN_CUSTODY here and we must handle
+ * it. Similarly, after waiting on %SCX_OPSS_DISPATCHING we see
+ * NONE but the task may still have %SCX_TASK_IN_CUSTODY set until
+ * it is enqueued on the destination.
+ */
+ call_task_dequeue(sch, rq, p, deq_flags);
}
-static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
+static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int core_deq_flags)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch = scx_task_sched(p);
+ u64 deq_flags = core_deq_flags;
+
+ /*
+ * Set %SCX_DEQ_SCHED_CHANGE when the dequeue is due to a property
+ * change (not sleep or core-sched pick).
+ */
+ if (!(deq_flags & (DEQUEUE_SLEEP | SCX_DEQ_CORE_SCHED_EXEC)))
+ deq_flags |= SCX_DEQ_SCHED_CHANGE;
if (!(p->scx.flags & SCX_TASK_QUEUED)) {
WARN_ON_ONCE(task_runnable(p));
@@ -1591,11 +2063,11 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
*/
if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) {
update_curr_scx(rq);
- SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, false);
+ SCX_CALL_OP_TASK(sch, stopping, rq, p, false);
}
if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p))
- SCX_CALL_OP_TASK(sch, SCX_KF_REST, quiescent, rq, p, deq_flags);
+ SCX_CALL_OP_TASK(sch, quiescent, rq, p, deq_flags);
if (deq_flags & SCX_DEQ_SLEEP)
p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
@@ -1607,32 +2079,56 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
sub_nr_running(rq, 1);
dispatch_dequeue(rq, p);
+ clear_direct_dispatch(p);
return true;
}
static void yield_task_scx(struct rq *rq)
{
- struct scx_sched *sch = scx_root;
struct task_struct *p = rq->donor;
+ struct scx_sched *sch = scx_task_sched(p);
if (SCX_HAS_OP(sch, yield))
- SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL);
+ SCX_CALL_OP_2TASKS_RET(sch, yield, rq, p, NULL);
else
p->scx.slice = 0;
}
static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
{
- struct scx_sched *sch = scx_root;
struct task_struct *from = rq->donor;
+ struct scx_sched *sch = scx_task_sched(from);
- if (SCX_HAS_OP(sch, yield))
- return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq,
- from, to);
+ if (SCX_HAS_OP(sch, yield) && sch == scx_task_sched(to))
+ return SCX_CALL_OP_2TASKS_RET(sch, yield, rq, from, to);
else
return false;
}
+static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+ /*
+ * Preemption between SCX tasks is implemented by resetting the victim
+ * task's slice to 0 and triggering reschedule on the target CPU.
+ * Nothing to do.
+ */
+ if (p->sched_class == &ext_sched_class)
+ return;
+
+ /*
+ * Getting preempted by a higher-priority class. Reenqueue IMMED tasks.
+ * This captures all preemption cases including:
+ *
+ * - A SCX task is currently running.
+ *
+ * - @rq is waking from idle due to a SCX task waking to it.
+ *
+ * - A higher-priority wakes up while SCX dispatch is in progress.
+ */
+ if (rq->scx.nr_immed)
+ schedule_reenq_local(rq, 0);
+}
+
static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
struct scx_dispatch_q *src_dsq,
struct rq *dst_rq)
@@ -1650,7 +2146,7 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
else
list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list);
- dsq_mod_nr(dst_dsq, 1);
+ dsq_inc_nr(dst_dsq, p, enq_flags);
p->scx.dsq = dst_dsq;
local_dsq_post_enq(dst_dsq, p, enq_flags);
@@ -1670,10 +2166,13 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
{
lockdep_assert_rq_held(src_rq);
- /* the following marks @p MIGRATING which excludes dequeue */
+ /*
+ * Set sticky_cpu before deactivate_task() to properly mark the
+ * beginning of an SCX-internal migration.
+ */
+ p->scx.sticky_cpu = cpu_of(dst_rq);
deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu_of(dst_rq));
- p->scx.sticky_cpu = cpu_of(dst_rq);
raw_spin_rq_unlock(src_rq);
raw_spin_rq_lock(dst_rq);
@@ -1713,7 +2212,7 @@ static bool task_can_run_on_remote_rq(struct scx_sched *sch,
struct task_struct *p, struct rq *rq,
bool enforce)
{
- int cpu = cpu_of(rq);
+ s32 cpu = cpu_of(rq);
WARN_ON_ONCE(task_cpu(p) == cpu);
@@ -1807,13 +2306,14 @@ static bool unlink_dsq_and_lock_src_rq(struct task_struct *p,
!WARN_ON_ONCE(src_rq != task_rq(p));
}
-static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
+static bool consume_remote_task(struct rq *this_rq,
+ struct task_struct *p, u64 enq_flags,
struct scx_dispatch_q *dsq, struct rq *src_rq)
{
raw_spin_rq_unlock(this_rq);
if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) {
- move_remote_task_to_local_dsq(p, 0, src_rq, this_rq);
+ move_remote_task_to_local_dsq(p, enq_flags, src_rq, this_rq);
return true;
} else {
raw_spin_rq_unlock(src_rq);
@@ -1853,8 +2353,9 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
if (src_rq != dst_rq &&
unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
- dst_dsq = find_global_dsq(sch, p);
+ dst_dsq = find_global_dsq(sch, task_cpu(p));
dst_rq = src_rq;
+ enq_flags |= SCX_ENQ_GDSQ_FALLBACK;
}
} else {
/* no need to migrate if destination is a non-local DSQ */
@@ -1885,14 +2386,14 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
dispatch_dequeue_locked(p, src_dsq);
raw_spin_unlock(&src_dsq->lock);
- dispatch_enqueue(sch, dst_dsq, p, enq_flags);
+ dispatch_enqueue(sch, dst_rq, dst_dsq, p, enq_flags);
}
return dst_rq;
}
static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq,
- struct scx_dispatch_q *dsq)
+ struct scx_dispatch_q *dsq, u64 enq_flags)
{
struct task_struct *p;
retry:
@@ -1917,18 +2418,18 @@ retry:
* the system into the bypass mode. This can easily live-lock the
* machine. If aborting, exit from all non-bypass DSQs.
*/
- if (unlikely(READ_ONCE(scx_aborting)) && dsq->id != SCX_DSQ_BYPASS)
+ if (unlikely(READ_ONCE(sch->aborting)) && dsq->id != SCX_DSQ_BYPASS)
break;
if (rq == task_rq) {
task_unlink_from_dsq(p, dsq);
- move_local_task_to_local_dsq(p, 0, dsq, rq);
+ move_local_task_to_local_dsq(p, enq_flags, dsq, rq);
raw_spin_unlock(&dsq->lock);
return true;
}
if (task_can_run_on_remote_rq(sch, p, rq, false)) {
- if (likely(consume_remote_task(rq, p, dsq, task_rq)))
+ if (likely(consume_remote_task(rq, p, enq_flags, dsq, task_rq)))
return true;
goto retry;
}
@@ -1942,7 +2443,7 @@ static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq)
{
int node = cpu_to_node(cpu_of(rq));
- return consume_dispatch_q(sch, rq, sch->global_dsqs[node]);
+ return consume_dispatch_q(sch, rq, &sch->pnode[node]->global_dsq, 0);
}
/**
@@ -1975,15 +2476,15 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
* If dispatching to @rq that @p is already on, no lock dancing needed.
*/
if (rq == src_rq && rq == dst_rq) {
- dispatch_enqueue(sch, dst_dsq, p,
+ dispatch_enqueue(sch, rq, dst_dsq, p,
enq_flags | SCX_ENQ_CLEAR_OPSS);
return;
}
if (src_rq != dst_rq &&
unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
- dispatch_enqueue(sch, find_global_dsq(sch, p), p,
- enq_flags | SCX_ENQ_CLEAR_OPSS);
+ dispatch_enqueue(sch, rq, find_global_dsq(sch, task_cpu(p)), p,
+ enq_flags | SCX_ENQ_CLEAR_OPSS | SCX_ENQ_GDSQ_FALLBACK);
return;
}
@@ -2020,7 +2521,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
*/
if (src_rq == dst_rq) {
p->scx.holding_cpu = -1;
- dispatch_enqueue(sch, &dst_rq->scx.local_dsq, p,
+ dispatch_enqueue(sch, dst_rq, &dst_rq->scx.local_dsq, p,
enq_flags);
} else {
move_remote_task_to_local_dsq(p, enq_flags,
@@ -2090,6 +2591,12 @@ retry:
if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch)
return;
+ /* see SCX_EV_INSERT_NOT_OWNED definition */
+ if (unlikely(!scx_task_on_sched(sch, p))) {
+ __scx_add_event(sch, SCX_EV_INSERT_NOT_OWNED, 1);
+ return;
+ }
+
/*
* While we know @p is accessible, we don't yet have a claim on
* it - the BPF scheduler is allowed to dispatch tasks
@@ -2114,17 +2621,17 @@ retry:
BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));
- dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, p);
+ dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, task_cpu(p));
if (dsq->id == SCX_DSQ_LOCAL)
dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
else
- dispatch_enqueue(sch, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
+ dispatch_enqueue(sch, rq, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
}
static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
{
- struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+ struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
u32 u;
for (u = 0; u < dspc->cursor; u++) {
@@ -2151,67 +2658,54 @@ static inline void maybe_queue_balance_callback(struct rq *rq)
rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
}
-static int balance_one(struct rq *rq, struct task_struct *prev)
+/*
+ * One user of this function is scx_bpf_dispatch() which can be called
+ * recursively as sub-sched dispatches nest. Always inline to reduce stack usage
+ * from the call frame.
+ */
+static __always_inline bool
+scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
+ struct task_struct *prev, bool nested)
{
- struct scx_sched *sch = scx_root;
- struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
- bool prev_on_scx = prev->sched_class == &ext_sched_class;
- bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED;
+ struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
int nr_loops = SCX_DSP_MAX_LOOPS;
+ s32 cpu = cpu_of(rq);
+ bool prev_on_sch = (prev->sched_class == &ext_sched_class) &&
+ scx_task_on_sched(sch, prev);
- lockdep_assert_rq_held(rq);
- rq->scx.flags |= SCX_RQ_IN_BALANCE;
- rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
-
- if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
- unlikely(rq->scx.cpu_released)) {
- /*
- * If the previous sched_class for the current CPU was not SCX,
- * notify the BPF scheduler that it again has control of the
- * core. This callback complements ->cpu_release(), which is
- * emitted in switch_class().
- */
- if (SCX_HAS_OP(sch, cpu_acquire))
- SCX_CALL_OP(sch, SCX_KF_REST, cpu_acquire, rq,
- cpu_of(rq), NULL);
- rq->scx.cpu_released = false;
- }
+ if (consume_global_dsq(sch, rq))
+ return true;
- if (prev_on_scx) {
- update_curr_scx(rq);
+ if (bypass_dsp_enabled(sch)) {
+ /* if @sch is bypassing, only the bypass DSQs are active */
+ if (scx_bypassing(sch, cpu))
+ return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0);
+#ifdef CONFIG_EXT_SUB_SCHED
/*
- * If @prev is runnable & has slice left, it has priority and
- * fetching more just increases latency for the fetched tasks.
- * Tell pick_task_scx() to keep running @prev. If the BPF
- * scheduler wants to handle this explicitly, it should
- * implement ->cpu_release().
+ * If @sch isn't bypassing but its children are, @sch is
+ * responsible for making forward progress for both its own
+ * tasks that aren't bypassing and the bypassing descendants'
+ * tasks. The following implements a simple built-in behavior -
+ * let each CPU try to run the bypass DSQ every Nth time.
*
- * See scx_disable_workfn() for the explanation on the bypassing
- * test.
+ * Later, if necessary, we can add an ops flag to suppress the
+ * auto-consumption and a kfunc to consume the bypass DSQ and,
+ * so that the BPF scheduler can fully control scheduling of
+ * bypassed tasks.
*/
- if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) {
- rq->scx.flags |= SCX_RQ_BAL_KEEP;
- goto has_tasks;
- }
- }
+ struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
- /* if there already are tasks to run, nothing to do */
- if (rq->scx.local_dsq.nr)
- goto has_tasks;
-
- if (consume_global_dsq(sch, rq))
- goto has_tasks;
-
- if (scx_rq_bypassing(rq)) {
- if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq))
- goto has_tasks;
- else
- goto no_tasks;
+ if (!(pcpu->bypass_host_seq++ % SCX_BYPASS_HOST_NTH) &&
+ consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0)) {
+ __scx_add_event(sch, SCX_EV_SUB_BYPASS_DISPATCH, 1);
+ return true;
+ }
+#endif /* CONFIG_EXT_SUB_SCHED */
}
if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
- goto no_tasks;
+ return false;
dspc->rq = rq;
@@ -2225,19 +2719,25 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
do {
dspc->nr_tasks = 0;
- SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq,
- cpu_of(rq), prev_on_scx ? prev : NULL);
+ if (nested) {
+ SCX_CALL_OP(sch, dispatch, rq, cpu, prev_on_sch ? prev : NULL);
+ } else {
+ /* stash @prev so that nested invocations can access it */
+ rq->scx.sub_dispatch_prev = prev;
+ SCX_CALL_OP(sch, dispatch, rq, cpu, prev_on_sch ? prev : NULL);
+ rq->scx.sub_dispatch_prev = NULL;
+ }
flush_dispatch_buf(sch, rq);
- if (prev_on_rq && prev->scx.slice) {
+ if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
- goto has_tasks;
+ return true;
}
if (rq->scx.local_dsq.nr)
- goto has_tasks;
+ return true;
if (consume_global_dsq(sch, rq))
- goto has_tasks;
+ return true;
/*
* ops.dispatch() can trap us in this loop by repeatedly
@@ -2246,21 +2746,80 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* balance(), we want to complete this scheduling cycle and then
* start a new one. IOW, we want to call resched_curr() on the
* next, most likely idle, task, not the current one. Use
- * scx_kick_cpu() for deferred kicking.
+ * __scx_bpf_kick_cpu() for deferred kicking.
*/
if (unlikely(!--nr_loops)) {
- scx_kick_cpu(sch, cpu_of(rq), 0);
+ scx_kick_cpu(sch, cpu, 0);
break;
}
} while (dspc->nr_tasks);
-no_tasks:
+ /*
+ * Prevent the CPU from going idle while bypassed descendants have tasks
+ * queued. Without this fallback, bypassed tasks could stall if the host
+ * scheduler's ops.dispatch() doesn't yield any tasks.
+ */
+ if (bypass_dsp_enabled(sch))
+ return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0);
+
+ return false;
+}
+
+static int balance_one(struct rq *rq, struct task_struct *prev)
+{
+ struct scx_sched *sch = scx_root;
+ s32 cpu = cpu_of(rq);
+
+ lockdep_assert_rq_held(rq);
+ rq->scx.flags |= SCX_RQ_IN_BALANCE;
+ rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
+
+ if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
+ unlikely(rq->scx.cpu_released)) {
+ /*
+ * If the previous sched_class for the current CPU was not SCX,
+ * notify the BPF scheduler that it again has control of the
+ * core. This callback complements ->cpu_release(), which is
+ * emitted in switch_class().
+ */
+ if (SCX_HAS_OP(sch, cpu_acquire))
+ SCX_CALL_OP(sch, cpu_acquire, rq, cpu, NULL);
+ rq->scx.cpu_released = false;
+ }
+
+ if (prev->sched_class == &ext_sched_class) {
+ update_curr_scx(rq);
+
+ /*
+ * If @prev is runnable & has slice left, it has priority and
+ * fetching more just increases latency for the fetched tasks.
+ * Tell pick_task_scx() to keep running @prev. If the BPF
+ * scheduler wants to handle this explicitly, it should
+ * implement ->cpu_release().
+ *
+ * See scx_disable_workfn() for the explanation on the bypassing
+ * test.
+ */
+ if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice &&
+ !scx_bypassing(sch, cpu)) {
+ rq->scx.flags |= SCX_RQ_BAL_KEEP;
+ goto has_tasks;
+ }
+ }
+
+ /* if there already are tasks to run, nothing to do */
+ if (rq->scx.local_dsq.nr)
+ goto has_tasks;
+
+ if (scx_dispatch_sched(sch, rq, prev, false))
+ goto has_tasks;
+
/*
* Didn't find another task to run. Keep running @prev unless
* %SCX_OPS_ENQ_LAST is in effect.
*/
- if (prev_on_rq &&
- (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) {
+ if ((prev->scx.flags & SCX_TASK_QUEUED) &&
+ (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_bypassing(sch, cpu))) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
__scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1);
goto has_tasks;
@@ -2269,40 +2828,26 @@ no_tasks:
return false;
has_tasks:
- rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
- return true;
-}
-
-static void process_ddsp_deferred_locals(struct rq *rq)
-{
- struct task_struct *p;
-
- lockdep_assert_rq_held(rq);
-
/*
- * Now that @rq can be unlocked, execute the deferred enqueueing of
- * tasks directly dispatched to the local DSQs of other CPUs. See
- * direct_dispatch(). Keep popping from the head instead of using
- * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq
- * temporarily.
+ * @rq may have extra IMMED tasks without reenq scheduled:
+ *
+ * - rq_is_open() can't reliably tell when and how slice is going to be
+ * modified for $curr and allows IMMED tasks to be queued while
+ * dispatch is in progress.
+ *
+ * - A non-IMMED HEAD task can get queued in front of an IMMED task
+ * between the IMMED queueing and the subsequent scheduling event.
*/
- while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
- struct task_struct, scx.dsq_list.node))) {
- struct scx_sched *sch = scx_root;
- struct scx_dispatch_q *dsq;
+ if (unlikely(rq->scx.local_dsq.nr > 1 && rq->scx.nr_immed))
+ schedule_reenq_local(rq, 0);
- list_del_init(&p->scx.dsq_list.node);
-
- dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
- if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
- dispatch_to_local_dsq(sch, rq, dsq, p,
- p->scx.ddsp_enq_flags);
- }
+ rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
+ return true;
}
static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch = scx_task_sched(p);
if (p->scx.flags & SCX_TASK_QUEUED) {
/*
@@ -2317,7 +2862,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
/* see dequeue_task_scx() on why we skip when !QUEUED */
if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED))
- SCX_CALL_OP_TASK(sch, SCX_KF_REST, running, rq, p);
+ SCX_CALL_OP_TASK(sch, running, rq, p);
clr_task_runnable(p, true);
@@ -2389,8 +2934,7 @@ static void switch_class(struct rq *rq, struct task_struct *next)
.task = next,
};
- SCX_CALL_OP(sch, SCX_KF_CPU_RELEASE, cpu_release, rq,
- cpu_of(rq), &args);
+ SCX_CALL_OP(sch, cpu_release, rq, cpu_of(rq), &args);
}
rq->scx.cpu_released = true;
}
@@ -2399,16 +2943,16 @@ static void switch_class(struct rq *rq, struct task_struct *next)
static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
struct task_struct *next)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch = scx_task_sched(p);
- /* see kick_cpus_irq_workfn() */
+ /* see kick_sync_wait_bal_cb() */
smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
update_curr_scx(rq);
/* see dequeue_task_scx() on why we skip when !QUEUED */
if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED))
- SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, true);
+ SCX_CALL_OP_TASK(sch, stopping, rq, p, true);
if (p->scx.flags & SCX_TASK_QUEUED) {
set_task_runnable(rq, p);
@@ -2417,11 +2961,17 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
* If @p has slice left and is being put, @p is getting
* preempted by a higher priority scheduler class or core-sched
* forcing a different task. Leave it at the head of the local
- * DSQ.
+ * DSQ unless it was an IMMED task. IMMED tasks should not
+ * linger on a busy CPU, reenqueue them to the BPF scheduler.
*/
- if (p->scx.slice && !scx_rq_bypassing(rq)) {
- dispatch_enqueue(sch, &rq->scx.local_dsq, p,
- SCX_ENQ_HEAD);
+ if (p->scx.slice && !scx_bypassing(sch, cpu_of(rq))) {
+ if (p->scx.flags & SCX_TASK_IMMED) {
+ p->scx.flags |= SCX_TASK_REENQ_PREEMPTED;
+ do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
+ p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
+ } else {
+ dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, SCX_ENQ_HEAD);
+ }
goto switch_class;
}
@@ -2444,6 +2994,48 @@ switch_class:
switch_class(rq, next);
}
+static void kick_sync_wait_bal_cb(struct rq *rq)
+{
+ struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs);
+ unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs;
+ bool waited;
+ s32 cpu;
+
+ /*
+ * Drop rq lock and enable IRQs while waiting. IRQs must be enabled
+ * — a target CPU may be waiting for us to process an IPI (e.g. TLB
+ * flush) while we wait for its kick_sync to advance.
+ *
+ * Also, keep advancing our own kick_sync so that new kick_sync waits
+ * targeting us, which can start after we drop the lock, cannot form
+ * cyclic dependencies.
+ */
+retry:
+ waited = false;
+ for_each_cpu(cpu, rq->scx.cpus_to_sync) {
+ /*
+ * smp_load_acquire() pairs with smp_store_release() on
+ * kick_sync updates on the target CPUs.
+ */
+ if (cpu == cpu_of(rq) ||
+ smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) {
+ cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync);
+ continue;
+ }
+
+ raw_spin_rq_unlock_irq(rq);
+ while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) {
+ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
+ cpu_relax();
+ }
+ raw_spin_rq_lock_irq(rq);
+ waited = true;
+ }
+
+ if (waited)
+ goto retry;
+}
+
static struct task_struct *first_local_task(struct rq *rq)
{
return list_first_entry_or_null(&rq->scx.local_dsq.list,
@@ -2457,10 +3049,10 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
bool keep_prev;
struct task_struct *p;
- /* see kick_cpus_irq_workfn() */
+ /* see kick_sync_wait_bal_cb() */
smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
- rq->next_class = &ext_sched_class;
+ rq_modified_begin(rq, &ext_sched_class);
rq_unpin_lock(rq, rf);
balance_one(rq, prev);
@@ -2468,6 +3060,17 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
maybe_queue_balance_callback(rq);
/*
+ * Defer to a balance callback which can drop rq lock and enable
+ * IRQs. Waiting directly in the pick path would deadlock against
+ * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them.
+ */
+ if (unlikely(rq->scx.kick_sync_pending)) {
+ rq->scx.kick_sync_pending = false;
+ queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb,
+ kick_sync_wait_bal_cb);
+ }
+
+ /*
* If any higher-priority sched class enqueued a runnable task on
* this rq during balance_one(), abort and return RETRY_TASK, so
* that the scheduler loop can restart.
@@ -2475,7 +3078,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
* If @force_scx is true, always try to pick a SCHED_EXT task,
* regardless of any higher-priority sched classes activity.
*/
- if (!force_scx && sched_class_above(rq->next_class, &ext_sched_class))
+ if (!force_scx && rq_modified_above(rq, &ext_sched_class))
return RETRY_TASK;
keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
@@ -2493,16 +3096,17 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
if (keep_prev) {
p = prev;
if (!p->scx.slice)
- refill_task_slice_dfl(rcu_dereference_sched(scx_root), p);
+ refill_task_slice_dfl(scx_task_sched(p), p);
} else {
p = first_local_task(rq);
if (!p)
return NULL;
if (unlikely(!p->scx.slice)) {
- struct scx_sched *sch = rcu_dereference_sched(scx_root);
+ struct scx_sched *sch = scx_task_sched(p);
- if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) {
+ if (!scx_bypassing(sch, cpu_of(rq)) &&
+ !sch->warned_zero_slice) {
printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
p->comm, p->pid, __func__);
sch->warned_zero_slice = true;
@@ -2568,16 +3172,17 @@ void ext_server_init(struct rq *rq)
bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
bool in_fi)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch_a = scx_task_sched(a);
+ struct scx_sched *sch_b = scx_task_sched(b);
/*
* The const qualifiers are dropped from task_struct pointers when
* calling ops.core_sched_before(). Accesses are controlled by the
* verifier.
*/
- if (SCX_HAS_OP(sch, core_sched_before) &&
- !scx_rq_bypassing(task_rq(a)))
- return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, core_sched_before,
+ if (sch_a == sch_b && SCX_HAS_OP(sch_a, core_sched_before) &&
+ !scx_bypassing(sch_a, task_cpu(a)))
+ return SCX_CALL_OP_2TASKS_RET(sch_a, core_sched_before,
NULL,
(struct task_struct *)a,
(struct task_struct *)b);
@@ -2588,8 +3193,8 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
{
- struct scx_sched *sch = scx_root;
- bool rq_bypass;
+ struct scx_sched *sch = scx_task_sched(p);
+ bool bypassing;
/*
* sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it
@@ -2604,8 +3209,8 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
if (unlikely(wake_flags & WF_EXEC))
return prev_cpu;
- rq_bypass = scx_rq_bypassing(task_rq(p));
- if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) {
+ bypassing = scx_bypassing(sch, task_cpu(p));
+ if (likely(SCX_HAS_OP(sch, select_cpu)) && !bypassing) {
s32 cpu;
struct task_struct **ddsp_taskp;
@@ -2613,10 +3218,9 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
WARN_ON_ONCE(*ddsp_taskp);
*ddsp_taskp = p;
- cpu = SCX_CALL_OP_TASK_RET(sch,
- SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
- select_cpu, NULL, p, prev_cpu,
- wake_flags);
+ this_rq()->scx.in_select_cpu = true;
+ cpu = SCX_CALL_OP_TASK_RET(sch, select_cpu, NULL, p, prev_cpu, wake_flags);
+ this_rq()->scx.in_select_cpu = false;
p->scx.selected_cpu = cpu;
*ddsp_taskp = NULL;
if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()"))
@@ -2635,7 +3239,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
}
p->scx.selected_cpu = cpu;
- if (rq_bypass)
+ if (bypassing)
__scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
return cpu;
}
@@ -2649,7 +3253,7 @@ static void task_woken_scx(struct rq *rq, struct task_struct *p)
static void set_cpus_allowed_scx(struct task_struct *p,
struct affinity_context *ac)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch = scx_task_sched(p);
set_cpus_allowed_common(p, ac);
@@ -2665,14 +3269,13 @@ static void set_cpus_allowed_scx(struct task_struct *p,
* designation pointless. Cast it away when calling the operation.
*/
if (SCX_HAS_OP(sch, set_cpumask))
- SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, NULL,
- p, (struct cpumask *)p->cpus_ptr);
+ SCX_CALL_OP_TASK(sch, set_cpumask, task_rq(p), p, (struct cpumask *)p->cpus_ptr);
}
static void handle_hotplug(struct rq *rq, bool online)
{
struct scx_sched *sch = scx_root;
- int cpu = cpu_of(rq);
+ s32 cpu = cpu_of(rq);
atomic_long_inc(&scx_hotplug_seq);
@@ -2688,9 +3291,9 @@ static void handle_hotplug(struct rq *rq, bool online)
scx_idle_update_selcpu_topology(&sch->ops);
if (online && SCX_HAS_OP(sch, cpu_online))
- SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_online, NULL, cpu);
+ SCX_CALL_OP(sch, cpu_online, NULL, cpu);
else if (!online && SCX_HAS_OP(sch, cpu_offline))
- SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_offline, NULL, cpu);
+ SCX_CALL_OP(sch, cpu_offline, NULL, cpu);
else
scx_exit(sch, SCX_EXIT_UNREG_KERN,
SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
@@ -2718,7 +3321,6 @@ static void rq_offline_scx(struct rq *rq)
rq->scx.flags &= ~SCX_RQ_ONLINE;
}
-
static bool check_rq_for_timeouts(struct rq *rq)
{
struct scx_sched *sch;
@@ -2732,10 +3334,11 @@ static bool check_rq_for_timeouts(struct rq *rq)
goto out_unlock;
list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
+ struct scx_sched *sch = scx_task_sched(p);
unsigned long last_runnable = p->scx.runnable_at;
if (unlikely(time_after(jiffies,
- last_runnable + scx_watchdog_timeout))) {
+ last_runnable + READ_ONCE(sch->watchdog_timeout)))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
@@ -2752,6 +3355,7 @@ out_unlock:
static void scx_watchdog_workfn(struct work_struct *work)
{
+ unsigned long intv;
int cpu;
WRITE_ONCE(scx_watchdog_timestamp, jiffies);
@@ -2762,28 +3366,30 @@ static void scx_watchdog_workfn(struct work_struct *work)
cond_resched();
}
- queue_delayed_work(system_unbound_wq, to_delayed_work(work),
- scx_watchdog_timeout / 2);
+
+ intv = READ_ONCE(scx_watchdog_interval);
+ if (intv < ULONG_MAX)
+ queue_delayed_work(system_dfl_wq, to_delayed_work(work), intv);
}
void scx_tick(struct rq *rq)
{
- struct scx_sched *sch;
+ struct scx_sched *root;
unsigned long last_check;
if (!scx_enabled())
return;
- sch = rcu_dereference_bh(scx_root);
- if (unlikely(!sch))
+ root = rcu_dereference_bh(scx_root);
+ if (unlikely(!root))
return;
last_check = READ_ONCE(scx_watchdog_timestamp);
if (unlikely(time_after(jiffies,
- last_check + READ_ONCE(scx_watchdog_timeout)))) {
+ last_check + READ_ONCE(root->watchdog_timeout)))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
- scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
+ scx_exit(root, SCX_EXIT_ERROR_STALL, 0,
"watchdog failed to check in for %u.%03us",
dur_ms / 1000, dur_ms % 1000);
}
@@ -2793,7 +3399,7 @@ void scx_tick(struct rq *rq)
static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch = scx_task_sched(curr);
update_curr_scx(rq);
@@ -2801,11 +3407,11 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
* While disabling, always resched and refresh core-sched timestamp as
* we can't trust the slice management or ops.core_sched_before().
*/
- if (scx_rq_bypassing(rq)) {
+ if (scx_bypassing(sch, cpu_of(rq))) {
curr->scx.slice = 0;
touch_core_sched(rq, curr);
} else if (SCX_HAS_OP(sch, tick)) {
- SCX_CALL_OP_TASK(sch, SCX_KF_REST, tick, rq, curr);
+ SCX_CALL_OP_TASK(sch, tick, rq, curr);
}
if (!curr->scx.slice)
@@ -2834,18 +3440,16 @@ static struct cgroup *tg_cgrp(struct task_group *tg)
#endif /* CONFIG_EXT_GROUP_SCHED */
-static enum scx_task_state scx_get_task_state(const struct task_struct *p)
+static u32 scx_get_task_state(const struct task_struct *p)
{
- return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT;
+ return p->scx.flags & SCX_TASK_STATE_MASK;
}
-static void scx_set_task_state(struct task_struct *p, enum scx_task_state state)
+static void scx_set_task_state(struct task_struct *p, u32 state)
{
- enum scx_task_state prev_state = scx_get_task_state(p);
+ u32 prev_state = scx_get_task_state(p);
bool warn = false;
- BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS));
-
switch (state) {
case SCX_TASK_NONE:
break;
@@ -2859,42 +3463,45 @@ static void scx_set_task_state(struct task_struct *p, enum scx_task_state state)
warn = prev_state != SCX_TASK_READY;
break;
default:
- warn = true;
+ WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]",
+ prev_state, state, p->comm, p->pid);
return;
}
- WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]",
+ WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]",
prev_state, state, p->comm, p->pid);
p->scx.flags &= ~SCX_TASK_STATE_MASK;
- p->scx.flags |= state << SCX_TASK_STATE_SHIFT;
+ p->scx.flags |= state;
}
-static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork)
+static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork)
{
- struct scx_sched *sch = scx_root;
int ret;
p->scx.disallow = false;
if (SCX_HAS_OP(sch, init_task)) {
struct scx_init_task_args args = {
- SCX_INIT_TASK_ARGS_CGROUP(tg)
+ SCX_INIT_TASK_ARGS_CGROUP(task_group(p))
.fork = fork,
};
- ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init_task, NULL,
- p, &args);
+ ret = SCX_CALL_OP_RET(sch, init_task, NULL, p, &args);
if (unlikely(ret)) {
ret = ops_sanitize_err(sch, "init_task", ret);
return ret;
}
}
- scx_set_task_state(p, SCX_TASK_INIT);
-
if (p->scx.disallow) {
- if (!fork) {
+ if (unlikely(scx_parent(sch))) {
+ scx_error(sch, "non-root ops.init_task() set task->scx.disallow for %s[%d]",
+ p->comm, p->pid);
+ } else if (unlikely(fork)) {
+ scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork",
+ p->comm, p->pid);
+ } else {
struct rq *rq;
struct rq_flags rf;
@@ -2913,25 +3520,43 @@ static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork
}
task_rq_unlock(rq, p, &rf);
- } else if (p->policy == SCHED_EXT) {
- scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork",
- p->comm, p->pid);
}
}
- p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
return 0;
}
-static void scx_enable_task(struct task_struct *p)
+static int scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork)
+{
+ int ret;
+
+ ret = __scx_init_task(sch, p, fork);
+ if (!ret) {
+ /*
+ * While @p's rq is not locked. @p is not visible to the rest of
+ * SCX yet and it's safe to update the flags and state.
+ */
+ p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
+ scx_set_task_state(p, SCX_TASK_INIT);
+ }
+ return ret;
+}
+
+static void __scx_enable_task(struct scx_sched *sch, struct task_struct *p)
{
- struct scx_sched *sch = scx_root;
struct rq *rq = task_rq(p);
u32 weight;
lockdep_assert_rq_held(rq);
/*
+ * Verify the task is not in BPF scheduler's custody. If flag
+ * transitions are consistent, the flag should always be clear
+ * here.
+ */
+ WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY);
+
+ /*
* Set the weight before calling ops.enable() so that the scheduler
* doesn't see a stale value if they inspect the task struct.
*/
@@ -2943,34 +3568,47 @@ static void scx_enable_task(struct task_struct *p)
p->scx.weight = sched_weight_to_cgroup(weight);
if (SCX_HAS_OP(sch, enable))
- SCX_CALL_OP_TASK(sch, SCX_KF_REST, enable, rq, p);
- scx_set_task_state(p, SCX_TASK_ENABLED);
+ SCX_CALL_OP_TASK(sch, enable, rq, p);
if (SCX_HAS_OP(sch, set_weight))
- SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
- p, p->scx.weight);
+ SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight);
}
-static void scx_disable_task(struct task_struct *p)
+static void scx_enable_task(struct scx_sched *sch, struct task_struct *p)
+{
+ __scx_enable_task(sch, p);
+ scx_set_task_state(p, SCX_TASK_ENABLED);
+}
+
+static void scx_disable_task(struct scx_sched *sch, struct task_struct *p)
{
- struct scx_sched *sch = scx_root;
struct rq *rq = task_rq(p);
lockdep_assert_rq_held(rq);
WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
+ clear_direct_dispatch(p);
+
if (SCX_HAS_OP(sch, disable))
- SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p);
+ SCX_CALL_OP_TASK(sch, disable, rq, p);
scx_set_task_state(p, SCX_TASK_READY);
+
+ /*
+ * Verify the task is not in BPF scheduler's custody. If flag
+ * transitions are consistent, the flag should always be clear
+ * here.
+ */
+ WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY);
}
-static void scx_exit_task(struct task_struct *p)
+static void __scx_disable_and_exit_task(struct scx_sched *sch,
+ struct task_struct *p)
{
- struct scx_sched *sch = scx_root;
struct scx_exit_task_args args = {
.cancelled = false,
};
+ lockdep_assert_held(&p->pi_lock);
lockdep_assert_rq_held(task_rq(p));
switch (scx_get_task_state(p)) {
@@ -2982,7 +3620,7 @@ static void scx_exit_task(struct task_struct *p)
case SCX_TASK_READY:
break;
case SCX_TASK_ENABLED:
- scx_disable_task(p);
+ scx_disable_task(sch, p);
break;
default:
WARN_ON_ONCE(true);
@@ -2990,8 +3628,26 @@ static void scx_exit_task(struct task_struct *p)
}
if (SCX_HAS_OP(sch, exit_task))
- SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p),
- p, &args);
+ SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args);
+}
+
+static void scx_disable_and_exit_task(struct scx_sched *sch,
+ struct task_struct *p)
+{
+ __scx_disable_and_exit_task(sch, p);
+
+ /*
+ * If set, @p exited between __scx_init_task() and scx_enable_task() in
+ * scx_sub_enable() and is initialized for both the associated sched and
+ * its parent. Disable and exit for the child too.
+ */
+ if ((p->scx.flags & SCX_TASK_SUB_INIT) &&
+ !WARN_ON_ONCE(!scx_enabling_sub_sched)) {
+ __scx_disable_and_exit_task(scx_enabling_sub_sched, p);
+ p->scx.flags &= ~SCX_TASK_SUB_INIT;
+ }
+
+ scx_set_task_sched(p, NULL);
scx_set_task_state(p, SCX_TASK_NONE);
}
@@ -3005,7 +3661,7 @@ void init_scx_entity(struct sched_ext_entity *scx)
INIT_LIST_HEAD(&scx->runnable_node);
scx->runnable_at = jiffies;
scx->ddsp_dsq_id = SCX_DSQ_INVALID;
- scx->slice = READ_ONCE(scx_slice_dfl);
+ scx->slice = SCX_SLICE_DFL;
}
void scx_pre_fork(struct task_struct *p)
@@ -3019,14 +3675,25 @@ void scx_pre_fork(struct task_struct *p)
percpu_down_read(&scx_fork_rwsem);
}
-int scx_fork(struct task_struct *p)
+int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs)
{
+ s32 ret;
+
percpu_rwsem_assert_held(&scx_fork_rwsem);
- if (scx_init_task_enabled)
- return scx_init_task(p, task_group(p), true);
- else
- return 0;
+ if (scx_init_task_enabled) {
+#ifdef CONFIG_EXT_SUB_SCHED
+ struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched;
+#else
+ struct scx_sched *sch = scx_root;
+#endif
+ ret = scx_init_task(sch, p, true);
+ if (!ret)
+ scx_set_task_sched(p, sch);
+ return ret;
+ }
+
+ return 0;
}
void scx_post_fork(struct task_struct *p)
@@ -3044,7 +3711,7 @@ void scx_post_fork(struct task_struct *p)
struct rq *rq;
rq = task_rq_lock(p, &rf);
- scx_enable_task(p);
+ scx_enable_task(scx_task_sched(p), p);
task_rq_unlock(rq, p, &rf);
}
}
@@ -3064,7 +3731,7 @@ void scx_cancel_fork(struct task_struct *p)
rq = task_rq_lock(p, &rf);
WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY);
- scx_exit_task(p);
+ scx_disable_and_exit_task(scx_task_sched(p), p);
task_rq_unlock(rq, p, &rf);
}
@@ -3115,15 +3782,15 @@ void sched_ext_dead(struct task_struct *p)
raw_spin_unlock_irqrestore(&scx_tasks_lock, flags);
/*
- * @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED
- * transitions can't race us. Disable ops for @p.
+ * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY ->
+ * ENABLED transitions can't race us. Disable ops for @p.
*/
if (scx_get_task_state(p) != SCX_TASK_NONE) {
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(p, &rf);
- scx_exit_task(p);
+ scx_disable_and_exit_task(scx_task_sched(p), p);
task_rq_unlock(rq, p, &rf);
}
}
@@ -3131,7 +3798,7 @@ void sched_ext_dead(struct task_struct *p)
static void reweight_task_scx(struct rq *rq, struct task_struct *p,
const struct load_weight *lw)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch = scx_task_sched(p);
lockdep_assert_rq_held(task_rq(p));
@@ -3140,8 +3807,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p,
p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
if (SCX_HAS_OP(sch, set_weight))
- SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
- p, p->scx.weight);
+ SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight);
}
static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio)
@@ -3150,20 +3816,19 @@ static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio)
static void switching_to_scx(struct rq *rq, struct task_struct *p)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch = scx_task_sched(p);
if (task_dead_and_done(p))
return;
- scx_enable_task(p);
+ scx_enable_task(sch, p);
/*
* set_cpus_allowed_scx() is not called while @p is associated with a
* different scheduler class. Keep the BPF scheduler up-to-date.
*/
if (SCX_HAS_OP(sch, set_cpumask))
- SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, rq,
- p, (struct cpumask *)p->cpus_ptr);
+ SCX_CALL_OP_TASK(sch, set_cpumask, rq, p, (struct cpumask *)p->cpus_ptr);
}
static void switched_from_scx(struct rq *rq, struct task_struct *p)
@@ -3171,11 +3836,9 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p)
if (task_dead_and_done(p))
return;
- scx_disable_task(p);
+ scx_disable_task(scx_task_sched(p), p);
}
-static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
-
static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
int scx_check_setscheduler(struct task_struct *p, int policy)
@@ -3190,17 +3853,327 @@ int scx_check_setscheduler(struct task_struct *p, int policy)
return 0;
}
+static void process_ddsp_deferred_locals(struct rq *rq)
+{
+ struct task_struct *p;
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * Now that @rq can be unlocked, execute the deferred enqueueing of
+ * tasks directly dispatched to the local DSQs of other CPUs. See
+ * direct_dispatch(). Keep popping from the head instead of using
+ * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq
+ * temporarily.
+ */
+ while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
+ struct task_struct, scx.dsq_list.node))) {
+ struct scx_sched *sch = scx_task_sched(p);
+ struct scx_dispatch_q *dsq;
+ u64 dsq_id = p->scx.ddsp_dsq_id;
+ u64 enq_flags = p->scx.ddsp_enq_flags;
+
+ list_del_init(&p->scx.dsq_list.node);
+ clear_direct_dispatch(p);
+
+ dsq = find_dsq_for_dispatch(sch, rq, dsq_id, task_cpu(p));
+ if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
+ dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
+ }
+}
+
+/*
+ * Determine whether @p should be reenqueued from a local DSQ.
+ *
+ * @reenq_flags is mutable and accumulates state across the DSQ walk:
+ *
+ * - %SCX_REENQ_TSR_NOT_FIRST: Set after the first task is visited. "First"
+ * tracks position in the DSQ list, not among IMMED tasks. A non-IMMED task at
+ * the head consumes the first slot.
+ *
+ * - %SCX_REENQ_TSR_RQ_OPEN: Set by reenq_local() before the walk if
+ * rq_is_open() is true.
+ *
+ * An IMMED task is kept (returns %false) only if it's the first task in the DSQ
+ * AND the current task is done — i.e. it will execute immediately. All other
+ * IMMED tasks are reenqueued. This means if a non-IMMED task sits at the head,
+ * every IMMED task behind it gets reenqueued.
+ *
+ * Reenqueued tasks go through ops.enqueue() with %SCX_ENQ_REENQ |
+ * %SCX_TASK_REENQ_IMMED. If the BPF scheduler dispatches back to the same local
+ * DSQ with %SCX_ENQ_IMMED while the CPU is still unavailable, this triggers
+ * another reenq cycle. Repetitions are bounded by %SCX_REENQ_LOCAL_MAX_REPEAT
+ * in process_deferred_reenq_locals().
+ */
+static bool local_task_should_reenq(struct task_struct *p, u64 *reenq_flags, u32 *reason)
+{
+ bool first;
+
+ first = !(*reenq_flags & SCX_REENQ_TSR_NOT_FIRST);
+ *reenq_flags |= SCX_REENQ_TSR_NOT_FIRST;
+
+ *reason = SCX_TASK_REENQ_KFUNC;
+
+ if ((p->scx.flags & SCX_TASK_IMMED) &&
+ (!first || !(*reenq_flags & SCX_REENQ_TSR_RQ_OPEN))) {
+ __scx_add_event(scx_task_sched(p), SCX_EV_REENQ_IMMED, 1);
+ *reason = SCX_TASK_REENQ_IMMED;
+ return true;
+ }
+
+ return *reenq_flags & SCX_REENQ_ANY;
+}
+
+static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags)
+{
+ LIST_HEAD(tasks);
+ u32 nr_enqueued = 0;
+ struct task_struct *p, *n;
+
+ lockdep_assert_rq_held(rq);
+
+ if (WARN_ON_ONCE(reenq_flags & __SCX_REENQ_TSR_MASK))
+ reenq_flags &= ~__SCX_REENQ_TSR_MASK;
+ if (rq_is_open(rq, 0))
+ reenq_flags |= SCX_REENQ_TSR_RQ_OPEN;
+
+ /*
+ * The BPF scheduler may choose to dispatch tasks back to
+ * @rq->scx.local_dsq. Move all candidate tasks off to a private list
+ * first to avoid processing the same tasks repeatedly.
+ */
+ list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
+ scx.dsq_list.node) {
+ struct scx_sched *task_sch = scx_task_sched(p);
+ u32 reason;
+
+ /*
+ * If @p is being migrated, @p's current CPU may not agree with
+ * its allowed CPUs and the migration_cpu_stop is about to
+ * deactivate and re-activate @p anyway. Skip re-enqueueing.
+ *
+ * While racing sched property changes may also dequeue and
+ * re-enqueue a migrating task while its current CPU and allowed
+ * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
+ * the current local DSQ for running tasks and thus are not
+ * visible to the BPF scheduler.
+ */
+ if (p->migration_pending)
+ continue;
+
+ if (!scx_is_descendant(task_sch, sch))
+ continue;
+
+ if (!local_task_should_reenq(p, &reenq_flags, &reason))
+ continue;
+
+ dispatch_dequeue(rq, p);
+
+ if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK))
+ p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
+ p->scx.flags |= reason;
+
+ list_add_tail(&p->scx.dsq_list.node, &tasks);
+ }
+
+ list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) {
+ list_del_init(&p->scx.dsq_list.node);
+
+ do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
+
+ p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
+ nr_enqueued++;
+ }
+
+ return nr_enqueued;
+}
+
+static void process_deferred_reenq_locals(struct rq *rq)
+{
+ u64 seq = ++rq->scx.deferred_reenq_locals_seq;
+
+ lockdep_assert_rq_held(rq);
+
+ while (true) {
+ struct scx_sched *sch;
+ u64 reenq_flags;
+ bool skip = false;
+
+ scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
+ struct scx_deferred_reenq_local *drl =
+ list_first_entry_or_null(&rq->scx.deferred_reenq_locals,
+ struct scx_deferred_reenq_local,
+ node);
+ struct scx_sched_pcpu *sch_pcpu;
+
+ if (!drl)
+ return;
+
+ sch_pcpu = container_of(drl, struct scx_sched_pcpu,
+ deferred_reenq_local);
+ sch = sch_pcpu->sch;
+
+ reenq_flags = drl->flags;
+ WRITE_ONCE(drl->flags, 0);
+ list_del_init(&drl->node);
+
+ if (likely(drl->seq != seq)) {
+ drl->seq = seq;
+ drl->cnt = 0;
+ } else {
+ if (unlikely(++drl->cnt > SCX_REENQ_LOCAL_MAX_REPEAT)) {
+ scx_error(sch, "SCX_ENQ_REENQ on SCX_DSQ_LOCAL repeated %u times",
+ drl->cnt);
+ skip = true;
+ }
+
+ __scx_add_event(sch, SCX_EV_REENQ_LOCAL_REPEAT, 1);
+ }
+ }
+
+ if (!skip) {
+ /* see schedule_dsq_reenq() */
+ smp_mb();
+
+ reenq_local(sch, rq, reenq_flags);
+ }
+ }
+}
+
+static bool user_task_should_reenq(struct task_struct *p, u64 reenq_flags, u32 *reason)
+{
+ *reason = SCX_TASK_REENQ_KFUNC;
+ return reenq_flags & SCX_REENQ_ANY;
+}
+
+static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flags)
+{
+ struct rq *locked_rq = rq;
+ struct scx_sched *sch = dsq->sched;
+ struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0);
+ struct task_struct *p;
+ s32 nr_enqueued = 0;
+
+ lockdep_assert_rq_held(rq);
+
+ raw_spin_lock(&dsq->lock);
+
+ while (likely(!READ_ONCE(sch->bypass_depth))) {
+ struct rq *task_rq;
+ u32 reason;
+
+ p = nldsq_cursor_next_task(&cursor, dsq);
+ if (!p)
+ break;
+
+ if (!user_task_should_reenq(p, reenq_flags, &reason))
+ continue;
+
+ task_rq = task_rq(p);
+
+ if (locked_rq != task_rq) {
+ if (locked_rq)
+ raw_spin_rq_unlock(locked_rq);
+ if (unlikely(!raw_spin_rq_trylock(task_rq))) {
+ raw_spin_unlock(&dsq->lock);
+ raw_spin_rq_lock(task_rq);
+ raw_spin_lock(&dsq->lock);
+ }
+ locked_rq = task_rq;
+
+ /* did we lose @p while switching locks? */
+ if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p))
+ continue;
+ }
+
+ /* @p is on @dsq, its rq and @dsq are locked */
+ dispatch_dequeue_locked(p, dsq);
+ raw_spin_unlock(&dsq->lock);
+
+ if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK))
+ p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
+ p->scx.flags |= reason;
+
+ do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1);
+
+ p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
+
+ if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) {
+ raw_spin_rq_unlock(locked_rq);
+ locked_rq = NULL;
+ cpu_relax();
+ }
+
+ raw_spin_lock(&dsq->lock);
+ }
+
+ list_del_init(&cursor.node);
+ raw_spin_unlock(&dsq->lock);
+
+ if (locked_rq != rq) {
+ if (locked_rq)
+ raw_spin_rq_unlock(locked_rq);
+ raw_spin_rq_lock(rq);
+ }
+}
+
+static void process_deferred_reenq_users(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+
+ while (true) {
+ struct scx_dispatch_q *dsq;
+ u64 reenq_flags;
+
+ scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
+ struct scx_deferred_reenq_user *dru =
+ list_first_entry_or_null(&rq->scx.deferred_reenq_users,
+ struct scx_deferred_reenq_user,
+ node);
+ struct scx_dsq_pcpu *dsq_pcpu;
+
+ if (!dru)
+ return;
+
+ dsq_pcpu = container_of(dru, struct scx_dsq_pcpu,
+ deferred_reenq_user);
+ dsq = dsq_pcpu->dsq;
+ reenq_flags = dru->flags;
+ WRITE_ONCE(dru->flags, 0);
+ list_del_init(&dru->node);
+ }
+
+ /* see schedule_dsq_reenq() */
+ smp_mb();
+
+ BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN);
+ reenq_user(rq, dsq, reenq_flags);
+ }
+}
+
+static void run_deferred(struct rq *rq)
+{
+ process_ddsp_deferred_locals(rq);
+
+ if (!list_empty(&rq->scx.deferred_reenq_locals))
+ process_deferred_reenq_locals(rq);
+
+ if (!list_empty(&rq->scx.deferred_reenq_users))
+ process_deferred_reenq_users(rq);
+}
+
#ifdef CONFIG_NO_HZ_FULL
bool scx_can_stop_tick(struct rq *rq)
{
struct task_struct *p = rq->curr;
-
- if (scx_rq_bypassing(rq))
- return false;
+ struct scx_sched *sch = scx_task_sched(p);
if (p->sched_class != &ext_sched_class)
return true;
+ if (scx_bypassing(sch, cpu_of(rq)))
+ return false;
+
/*
* @rq can dispatch from different DSQs, so we can't tell whether it
* needs the tick or not by looking at nr_running. Allow stopping ticks
@@ -3238,7 +4211,7 @@ int scx_tg_online(struct task_group *tg)
.bw_quota_us = tg->scx.bw_quota_us,
.bw_burst_us = tg->scx.bw_burst_us };
- ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init,
+ ret = SCX_CALL_OP_RET(sch, cgroup_init,
NULL, tg->css.cgroup, &args);
if (ret)
ret = ops_sanitize_err(sch, "cgroup_init", ret);
@@ -3260,8 +4233,7 @@ void scx_tg_offline(struct task_group *tg)
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) &&
(tg->scx.flags & SCX_TG_INITED))
- SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
- tg->css.cgroup);
+ SCX_CALL_OP(sch, cgroup_exit, NULL, tg->css.cgroup);
tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
}
@@ -3290,8 +4262,7 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
continue;
if (SCX_HAS_OP(sch, cgroup_prep_move)) {
- ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED,
- cgroup_prep_move, NULL,
+ ret = SCX_CALL_OP_RET(sch, cgroup_prep_move, NULL,
p, from, css->cgroup);
if (ret)
goto err;
@@ -3306,7 +4277,7 @@ err:
cgroup_taskset_for_each(p, css, tset) {
if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
p->scx.cgrp_moving_from)
- SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
+ SCX_CALL_OP(sch, cgroup_cancel_move, NULL,
p, p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
}
@@ -3327,7 +4298,7 @@ void scx_cgroup_move_task(struct task_struct *p)
*/
if (SCX_HAS_OP(sch, cgroup_move) &&
!WARN_ON_ONCE(!p->scx.cgrp_moving_from))
- SCX_CALL_OP_TASK(sch, SCX_KF_UNLOCKED, cgroup_move, NULL,
+ SCX_CALL_OP_TASK(sch, cgroup_move, task_rq(p),
p, p->scx.cgrp_moving_from,
tg_cgrp(task_group(p)));
p->scx.cgrp_moving_from = NULL;
@@ -3345,7 +4316,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
cgroup_taskset_for_each(p, css, tset) {
if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
p->scx.cgrp_moving_from)
- SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
+ SCX_CALL_OP(sch, cgroup_cancel_move, NULL,
p, p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
}
@@ -3359,8 +4330,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
tg->scx.weight != weight)
- SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL,
- tg_cgrp(tg), weight);
+ SCX_CALL_OP(sch, cgroup_set_weight, NULL, tg_cgrp(tg), weight);
tg->scx.weight = weight;
@@ -3374,8 +4344,7 @@ void scx_group_set_idle(struct task_group *tg, bool idle)
percpu_down_read(&scx_cgroup_ops_rwsem);
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle))
- SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_idle, NULL,
- tg_cgrp(tg), idle);
+ SCX_CALL_OP(sch, cgroup_set_idle, NULL, tg_cgrp(tg), idle);
/* Update the task group's idle state */
tg->scx.idle = idle;
@@ -3394,7 +4363,7 @@ void scx_group_set_bandwidth(struct task_group *tg,
(tg->scx.bw_period_us != period_us ||
tg->scx.bw_quota_us != quota_us ||
tg->scx.bw_burst_us != burst_us))
- SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_bandwidth, NULL,
+ SCX_CALL_OP(sch, cgroup_set_bandwidth, NULL,
tg_cgrp(tg), period_us, quota_us, burst_us);
tg->scx.bw_period_us = period_us;
@@ -3403,33 +4372,55 @@ void scx_group_set_bandwidth(struct task_group *tg,
percpu_up_read(&scx_cgroup_ops_rwsem);
}
+#endif /* CONFIG_EXT_GROUP_SCHED */
+
+#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED)
+static struct cgroup *root_cgroup(void)
+{
+ return &cgrp_dfl_root.cgrp;
+}
+
+static struct cgroup *sch_cgroup(struct scx_sched *sch)
+{
+ return sch->cgrp;
+}
+
+/* for each descendant of @cgrp including self, set ->scx_sched to @sch */
+static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch)
+{
+ struct cgroup *pos;
+ struct cgroup_subsys_state *css;
+
+ cgroup_for_each_live_descendant_pre(pos, css, cgrp)
+ rcu_assign_pointer(pos->scx_sched, sch);
+}
static void scx_cgroup_lock(void)
{
+#ifdef CONFIG_EXT_GROUP_SCHED
percpu_down_write(&scx_cgroup_ops_rwsem);
+#endif
cgroup_lock();
}
static void scx_cgroup_unlock(void)
{
cgroup_unlock();
+#ifdef CONFIG_EXT_GROUP_SCHED
percpu_up_write(&scx_cgroup_ops_rwsem);
+#endif
}
-
-#else /* CONFIG_EXT_GROUP_SCHED */
-
+#else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
+static struct cgroup *root_cgroup(void) { return NULL; }
+static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; }
+static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {}
static void scx_cgroup_lock(void) {}
static void scx_cgroup_unlock(void) {}
-
-#endif /* CONFIG_EXT_GROUP_SCHED */
+#endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
/*
* Omitted operations:
*
- * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task
- * isn't tied to the CPU at that point. Preemption is implemented by resetting
- * the victim task's slice to 0 and triggering reschedule on the target CPU.
- *
* - migrate_task_rq: Unnecessary as task to cpu mapping is transient.
*
* - task_fork/dead: We need fork/dead notifications for all tasks regardless of
@@ -3470,13 +4461,60 @@ DEFINE_SCHED_CLASS(ext) = {
#endif
};
-static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
+static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id,
+ struct scx_sched *sch)
{
+ s32 cpu;
+
memset(dsq, 0, sizeof(*dsq));
raw_spin_lock_init(&dsq->lock);
INIT_LIST_HEAD(&dsq->list);
dsq->id = dsq_id;
+ dsq->sched = sch;
+
+ dsq->pcpu = alloc_percpu(struct scx_dsq_pcpu);
+ if (!dsq->pcpu)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu);
+
+ pcpu->dsq = dsq;
+ INIT_LIST_HEAD(&pcpu->deferred_reenq_user.node);
+ }
+
+ return 0;
+}
+
+static void exit_dsq(struct scx_dispatch_q *dsq)
+{
+ s32 cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu);
+ struct scx_deferred_reenq_user *dru = &pcpu->deferred_reenq_user;
+ struct rq *rq = cpu_rq(cpu);
+
+ /*
+ * There must have been a RCU grace period since the last
+ * insertion and @dsq should be off the deferred list by now.
+ */
+ if (WARN_ON_ONCE(!list_empty(&dru->node))) {
+ guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);
+ list_del_init(&dru->node);
+ }
+ }
+
+ free_percpu(dsq->pcpu);
+}
+
+static void free_dsq_rcufn(struct rcu_head *rcu)
+{
+ struct scx_dispatch_q *dsq = container_of(rcu, struct scx_dispatch_q, rcu);
+
+ exit_dsq(dsq);
+ kfree(dsq);
}
static void free_dsq_irq_workfn(struct irq_work *irq_work)
@@ -3485,7 +4523,7 @@ static void free_dsq_irq_workfn(struct irq_work *irq_work)
struct scx_dispatch_q *dsq, *tmp_dsq;
llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node)
- kfree_rcu(dsq, rcu);
+ call_rcu(&dsq->rcu, free_dsq_rcufn);
}
static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn);
@@ -3550,8 +4588,7 @@ static void scx_cgroup_exit(struct scx_sched *sch)
if (!sch->ops.cgroup_exit)
continue;
- SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
- css->cgroup);
+ SCX_CALL_OP(sch, cgroup_exit, NULL, css->cgroup);
}
}
@@ -3582,10 +4619,9 @@ static int scx_cgroup_init(struct scx_sched *sch)
continue;
}
- ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL,
+ ret = SCX_CALL_OP_RET(sch, cgroup_init, NULL,
css->cgroup, &args);
if (ret) {
- css_put(css);
scx_error(sch, "ops.cgroup_init() failed (%d)", ret);
return ret;
}
@@ -3662,6 +4698,7 @@ static const struct attribute_group scx_global_attr_group = {
.attrs = scx_global_attrs,
};
+static void free_pnode(struct scx_sched_pnode *pnode);
static void free_exit_info(struct scx_exit_info *ei);
static void scx_sched_free_rcu_work(struct work_struct *work)
@@ -3670,22 +4707,42 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
struct rhashtable_iter rht_iter;
struct scx_dispatch_q *dsq;
- int node;
+ int cpu, node;
- irq_work_sync(&sch->error_irq_work);
+ irq_work_sync(&sch->disable_irq_work);
kthread_destroy_worker(sch->helper);
+ timer_shutdown_sync(&sch->bypass_lb_timer);
+
+#ifdef CONFIG_EXT_SUB_SCHED
+ kfree(sch->cgrp_path);
+ if (sch_cgroup(sch))
+ cgroup_put(sch_cgroup(sch));
+#endif /* CONFIG_EXT_SUB_SCHED */
+
+ for_each_possible_cpu(cpu) {
+ struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
+
+ /*
+		 * @sch would have entered bypass mode before the RCU grace
+ * period. As that blocks new deferrals, all
+ * deferred_reenq_local_node's must be off-list by now.
+ */
+ WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local.node));
+
+ exit_dsq(bypass_dsq(sch, cpu));
+ }
free_percpu(sch->pcpu);
for_each_node_state(node, N_POSSIBLE)
- kfree(sch->global_dsqs[node]);
- kfree(sch->global_dsqs);
+ free_pnode(sch->pnode[node]);
+ kfree(sch->pnode);
rhashtable_walk_enter(&sch->dsq_hash, &rht_iter);
do {
rhashtable_walk_start(&rht_iter);
- while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq))
+ while (!IS_ERR_OR_NULL((dsq = rhashtable_walk_next(&rht_iter))))
destroy_dsq(sch, dsq->id);
rhashtable_walk_stop(&rht_iter);
@@ -3702,13 +4759,15 @@ static void scx_kobj_release(struct kobject *kobj)
struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
- queue_rcu_work(system_unbound_wq, &sch->rcu_work);
+ queue_rcu_work(system_dfl_wq, &sch->rcu_work);
}
static ssize_t scx_attr_ops_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return sysfs_emit(buf, "%s\n", scx_root->ops.name);
+ struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
+
+ return sysfs_emit(buf, "%s\n", sch->ops.name);
}
SCX_ATTR(ops);
@@ -3729,10 +4788,14 @@ static ssize_t scx_attr_events_show(struct kobject *kobj,
at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING);
at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_IMMED);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_LOCAL_REPEAT);
at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL);
at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION);
at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH);
at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_INSERT_NOT_OWNED);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_SUB_BYPASS_DISPATCH);
return at;
}
SCX_ATTR(events);
@@ -3752,7 +4815,19 @@ static const struct kobj_type scx_ktype = {
static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
{
- return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name);
+ const struct scx_sched *sch;
+
+ /*
+ * scx_uevent() can be reached by both scx_sched kobjects (scx_ktype)
+ * and sub-scheduler kset kobjects (kset_ktype) through the parent
+ * chain walk. Filter out the latter to avoid invalid casts.
+ */
+ if (kobj->ktype != &scx_ktype)
+ return 0;
+
+ sch = container_of(kobj, struct scx_sched, kobj);
+
+ return add_uevent_var(env, "SCXOPS=%s", sch->ops.name);
}
static const struct kset_uevent_ops scx_uevent_ops = {
@@ -3779,7 +4854,7 @@ bool scx_allow_ttwu_queue(const struct task_struct *p)
if (!scx_enabled())
return true;
- sch = rcu_dereference_sched(scx_root);
+ sch = scx_task_sched(p);
if (unlikely(!sch))
return true;
@@ -3872,7 +4947,7 @@ void scx_softlockup(u32 dur_s)
* a good state before taking more drastic actions.
*
* Returns %true if sched_ext is enabled and abort was initiated, which may
- * resolve the reported hardlockdup. %false if sched_ext is not enabled or
+ * resolve the reported hardlockup. %false if sched_ext is not enabled or
* someone else already initiated abort.
*/
bool scx_hardlockup(int cpu)
@@ -3885,13 +4960,14 @@ bool scx_hardlockup(int cpu)
return true;
}
-static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
+static u32 bypass_lb_cpu(struct scx_sched *sch, s32 donor,
struct cpumask *donee_mask, struct cpumask *resched_mask,
u32 nr_donor_target, u32 nr_donee_target)
{
- struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
+ struct rq *donor_rq = cpu_rq(donor);
+ struct scx_dispatch_q *donor_dsq = bypass_dsq(sch, donor);
struct task_struct *p, *n;
- struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0);
+ struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, donor_dsq, 0);
s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target;
u32 nr_balanced = 0, min_delta_us;
@@ -3901,11 +4977,11 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
* consider offloading iff the total queued duration is over the
* threshold.
*/
- min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV;
- if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us))
+ min_delta_us = READ_ONCE(scx_bypass_lb_intv_us) / SCX_BYPASS_LB_MIN_DELTA_DIV;
+ if (delta < DIV_ROUND_UP(min_delta_us, READ_ONCE(scx_slice_bypass_us)))
return 0;
- raw_spin_rq_lock_irq(rq);
+ raw_spin_rq_lock_irq(donor_rq);
raw_spin_lock(&donor_dsq->lock);
list_add(&cursor.node, &donor_dsq->list);
resume:
@@ -3913,7 +4989,6 @@ resume:
n = nldsq_next_task(donor_dsq, n, false);
while ((p = n)) {
- struct rq *donee_rq;
struct scx_dispatch_q *donee_dsq;
int donee;
@@ -3929,14 +5004,13 @@ resume:
if (donee >= nr_cpu_ids)
continue;
- donee_rq = cpu_rq(donee);
- donee_dsq = &donee_rq->scx.bypass_dsq;
+ donee_dsq = bypass_dsq(sch, donee);
/*
* $p's rq is not locked but $p's DSQ lock protects its
* scheduling properties making this test safe.
*/
- if (!task_can_run_on_remote_rq(sch, p, donee_rq, false))
+ if (!task_can_run_on_remote_rq(sch, p, cpu_rq(donee), false))
continue;
/*
@@ -3951,7 +5025,7 @@ resume:
* between bypass DSQs.
*/
dispatch_dequeue_locked(p, donor_dsq);
- dispatch_enqueue(sch, donee_dsq, p, SCX_ENQ_NESTED);
+ dispatch_enqueue(sch, cpu_rq(donee), donee_dsq, p, SCX_ENQ_NESTED);
/*
* $donee might have been idle and need to be woken up. No need
@@ -3966,9 +5040,9 @@ resume:
if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) {
list_move_tail(&cursor.node, &n->scx.dsq_list.node);
raw_spin_unlock(&donor_dsq->lock);
- raw_spin_rq_unlock_irq(rq);
+ raw_spin_rq_unlock_irq(donor_rq);
cpu_relax();
- raw_spin_rq_lock_irq(rq);
+ raw_spin_rq_lock_irq(donor_rq);
raw_spin_lock(&donor_dsq->lock);
goto resume;
}
@@ -3976,7 +5050,7 @@ resume:
list_del_init(&cursor.node);
raw_spin_unlock(&donor_dsq->lock);
- raw_spin_rq_unlock_irq(rq);
+ raw_spin_rq_unlock_irq(donor_rq);
return nr_balanced;
}
@@ -3994,7 +5068,7 @@ static void bypass_lb_node(struct scx_sched *sch, int node)
/* count the target tasks and CPUs */
for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
- u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr);
+ u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr);
nr_tasks += nr;
nr_cpus++;
@@ -4016,24 +5090,21 @@ static void bypass_lb_node(struct scx_sched *sch, int node)
cpumask_clear(donee_mask);
for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
- if (READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr) < nr_target)
+ if (READ_ONCE(bypass_dsq(sch, cpu)->nr) < nr_target)
cpumask_set_cpu(cpu, donee_mask);
}
/* iterate !donee CPUs and see if they should be offloaded */
cpumask_clear(resched_mask);
for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
- struct rq *rq = cpu_rq(cpu);
- struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
-
if (cpumask_empty(donee_mask))
break;
if (cpumask_test_cpu(cpu, donee_mask))
continue;
- if (READ_ONCE(donor_dsq->nr) <= nr_donor_target)
+ if (READ_ONCE(bypass_dsq(sch, cpu)->nr) <= nr_donor_target)
continue;
- nr_balanced += bypass_lb_cpu(sch, rq, donee_mask, resched_mask,
+ nr_balanced += bypass_lb_cpu(sch, cpu, donee_mask, resched_mask,
nr_donor_target, nr_target);
}
@@ -4041,7 +5112,7 @@ static void bypass_lb_node(struct scx_sched *sch, int node)
resched_cpu(cpu);
for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
- u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr);
+ u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr);
after_min = min(nr, after_min);
after_max = max(nr, after_max);
@@ -4063,12 +5134,11 @@ static void bypass_lb_node(struct scx_sched *sch, int node)
*/
static void scx_bypass_lb_timerfn(struct timer_list *timer)
{
- struct scx_sched *sch;
+ struct scx_sched *sch = container_of(timer, struct scx_sched, bypass_lb_timer);
int node;
u32 intv_us;
- sch = rcu_dereference_all(scx_root);
- if (unlikely(!sch) || !READ_ONCE(scx_bypass_depth))
+ if (!bypass_dsp_enabled(sch))
return;
for_each_node_with_cpus(node)
@@ -4079,10 +5149,102 @@ static void scx_bypass_lb_timerfn(struct timer_list *timer)
mod_timer(timer, jiffies + usecs_to_jiffies(intv_us));
}
-static DEFINE_TIMER(scx_bypass_lb_timer, scx_bypass_lb_timerfn);
+static bool inc_bypass_depth(struct scx_sched *sch)
+{
+ lockdep_assert_held(&scx_bypass_lock);
+
+ WARN_ON_ONCE(sch->bypass_depth < 0);
+ WRITE_ONCE(sch->bypass_depth, sch->bypass_depth + 1);
+ if (sch->bypass_depth != 1)
+ return false;
+
+ WRITE_ONCE(sch->slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC);
+ sch->bypass_timestamp = ktime_get_ns();
+ scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
+ return true;
+}
+
+static bool dec_bypass_depth(struct scx_sched *sch)
+{
+ lockdep_assert_held(&scx_bypass_lock);
+
+ WARN_ON_ONCE(sch->bypass_depth < 1);
+ WRITE_ONCE(sch->bypass_depth, sch->bypass_depth - 1);
+ if (sch->bypass_depth != 0)
+ return false;
+
+ WRITE_ONCE(sch->slice_dfl, SCX_SLICE_DFL);
+ scx_add_event(sch, SCX_EV_BYPASS_DURATION,
+ ktime_get_ns() - sch->bypass_timestamp);
+ return true;
+}
+
+static void enable_bypass_dsp(struct scx_sched *sch)
+{
+ struct scx_sched *host = scx_parent(sch) ?: sch;
+ u32 intv_us = READ_ONCE(scx_bypass_lb_intv_us);
+ s32 ret;
+
+ /*
+ * @sch->bypass_depth transitioning from 0 to 1 triggers enabling.
+ * Shouldn't stagger.
+ */
+ if (WARN_ON_ONCE(test_and_set_bit(0, &sch->bypass_dsp_claim)))
+ return;
+
+ /*
+ * When a sub-sched bypasses, its tasks are queued on the bypass DSQs of
+ * the nearest non-bypassing ancestor or root. As enable_bypass_dsp() is
+ * called iff @sch is not already bypassed due to an ancestor bypassing,
+ * we can assume that the parent is not bypassing and thus will be the
+ * host of the bypass DSQs.
+ *
+ * While the situation may change in the future, the following
+ * guarantees that the nearest non-bypassing ancestor or root has bypass
+ * dispatch enabled while a descendant is bypassing, which is all that's
+ * required.
+ *
+ * bypass_dsp_enabled() test is used to determine whether to enter the
+ * bypass dispatch handling path from both bypassing and hosting scheds.
+ * Bump enable depth on both @sch and bypass dispatch host.
+ */
+ ret = atomic_inc_return(&sch->bypass_dsp_enable_depth);
+ WARN_ON_ONCE(ret <= 0);
+
+ if (host != sch) {
+ ret = atomic_inc_return(&host->bypass_dsp_enable_depth);
+ WARN_ON_ONCE(ret <= 0);
+ }
+
+ /*
+ * The LB timer will stop running if bypass dispatch is disabled. Start
+ * after enabling bypass dispatch.
+ */
+ if (intv_us && !timer_pending(&host->bypass_lb_timer))
+ mod_timer(&host->bypass_lb_timer,
+ jiffies + usecs_to_jiffies(intv_us));
+}
+
+/* may be called without holding scx_bypass_lock */
+static void disable_bypass_dsp(struct scx_sched *sch)
+{
+ s32 ret;
+
+ if (!test_and_clear_bit(0, &sch->bypass_dsp_claim))
+ return;
+
+ ret = atomic_dec_return(&sch->bypass_dsp_enable_depth);
+ WARN_ON_ONCE(ret < 0);
+
+ if (scx_parent(sch)) {
+ ret = atomic_dec_return(&scx_parent(sch)->bypass_dsp_enable_depth);
+ WARN_ON_ONCE(ret < 0);
+ }
+}
/**
* scx_bypass - [Un]bypass scx_ops and guarantee forward progress
+ * @sch: sched to bypass
* @bypass: true for bypass, false for unbypass
*
* Bypassing guarantees that all runnable tasks make forward progress without
@@ -4112,49 +5274,42 @@ static DEFINE_TIMER(scx_bypass_lb_timer, scx_bypass_lb_timerfn);
*
* - scx_prio_less() reverts to the default core_sched_at order.
*/
-static void scx_bypass(bool bypass)
+static void scx_bypass(struct scx_sched *sch, bool bypass)
{
- static DEFINE_RAW_SPINLOCK(bypass_lock);
- static unsigned long bypass_timestamp;
- struct scx_sched *sch;
+ struct scx_sched *pos;
unsigned long flags;
int cpu;
- raw_spin_lock_irqsave(&bypass_lock, flags);
- sch = rcu_dereference_bh(scx_root);
+ raw_spin_lock_irqsave(&scx_bypass_lock, flags);
if (bypass) {
- u32 intv_us;
-
- WRITE_ONCE(scx_bypass_depth, scx_bypass_depth + 1);
- WARN_ON_ONCE(scx_bypass_depth <= 0);
- if (scx_bypass_depth != 1)
+ if (!inc_bypass_depth(sch))
goto unlock;
- WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC);
- bypass_timestamp = ktime_get_ns();
- if (sch)
- scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
-
- intv_us = READ_ONCE(scx_bypass_lb_intv_us);
- if (intv_us && !timer_pending(&scx_bypass_lb_timer)) {
- scx_bypass_lb_timer.expires =
- jiffies + usecs_to_jiffies(intv_us);
- add_timer_global(&scx_bypass_lb_timer);
- }
+
+ enable_bypass_dsp(sch);
} else {
- WRITE_ONCE(scx_bypass_depth, scx_bypass_depth - 1);
- WARN_ON_ONCE(scx_bypass_depth < 0);
- if (scx_bypass_depth != 0)
+ if (!dec_bypass_depth(sch))
goto unlock;
- WRITE_ONCE(scx_slice_dfl, SCX_SLICE_DFL);
- if (sch)
- scx_add_event(sch, SCX_EV_BYPASS_DURATION,
- ktime_get_ns() - bypass_timestamp);
}
/*
+ * Bypass state is propagated to all descendants - an scx_sched bypasses
+ * if itself or any of its ancestors are in bypass mode.
+ */
+ raw_spin_lock(&scx_sched_lock);
+ scx_for_each_descendant_pre(pos, sch) {
+ if (pos == sch)
+ continue;
+ if (bypass)
+ inc_bypass_depth(pos);
+ else
+ dec_bypass_depth(pos);
+ }
+ raw_spin_unlock(&scx_sched_lock);
+
+ /*
* No task property is changing. We just need to make sure all currently
- * queued tasks are re-queued according to the new scx_rq_bypassing()
+ * queued tasks are re-queued according to the new scx_bypassing()
* state. As an optimization, walk each rq's runnable_list instead of
* the scx_tasks list.
*
@@ -4166,19 +5321,23 @@ static void scx_bypass(bool bypass)
struct task_struct *p, *n;
raw_spin_rq_lock(rq);
+ raw_spin_lock(&scx_sched_lock);
- if (bypass) {
- WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
- rq->scx.flags |= SCX_RQ_BYPASSING;
- } else {
- WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING));
- rq->scx.flags &= ~SCX_RQ_BYPASSING;
+ scx_for_each_descendant_pre(pos, sch) {
+ struct scx_sched_pcpu *pcpu = per_cpu_ptr(pos->pcpu, cpu);
+
+ if (pos->bypass_depth)
+ pcpu->flags |= SCX_SCHED_PCPU_BYPASSING;
+ else
+ pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING;
}
+ raw_spin_unlock(&scx_sched_lock);
+
/*
* We need to guarantee that no tasks are on the BPF scheduler
* while bypassing. Either we see enabled or the enable path
- * sees scx_rq_bypassing() before moving tasks to SCX.
+ * sees scx_bypassing() before moving tasks to SCX.
*/
if (!scx_enabled()) {
raw_spin_rq_unlock(rq);
@@ -4194,6 +5353,9 @@ static void scx_bypass(bool bypass)
*/
list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
scx.runnable_node) {
+ if (!scx_is_descendant(scx_task_sched(p), sch))
+ continue;
+
/* cycling deq/enq is enough, see the function comment */
scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
/* nothing */ ;
@@ -4207,8 +5369,11 @@ static void scx_bypass(bool bypass)
raw_spin_rq_unlock(rq);
}
+ /* disarming must come after moving all tasks out of the bypass DSQs */
+ if (!bypass)
+ disable_bypass_dsp(sch);
unlock:
- raw_spin_unlock_irqrestore(&bypass_lock, flags);
+ raw_spin_unlock_irqrestore(&scx_bypass_lock, flags);
}
static void free_exit_info(struct scx_exit_info *ei)
@@ -4250,6 +5415,8 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
return "unregistered from the main kernel";
case SCX_EXIT_SYSRQ:
return "disabled by sysrq-S";
+ case SCX_EXIT_PARENT:
+ return "parent exiting";
case SCX_EXIT_ERROR:
return "runtime error";
case SCX_EXIT_ERROR_BPF:
@@ -4275,28 +5442,279 @@ static void free_kick_syncs(void)
}
}
-static void scx_disable_workfn(struct kthread_work *work)
+static void refresh_watchdog(void)
{
- struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
- struct scx_exit_info *ei = sch->exit_info;
+ struct scx_sched *sch;
+ unsigned long intv = ULONG_MAX;
+
+ /* take the shortest timeout and use its half for watchdog interval */
+ rcu_read_lock();
+ list_for_each_entry_rcu(sch, &scx_sched_all, all)
+ intv = max(min(intv, sch->watchdog_timeout / 2), 1);
+ rcu_read_unlock();
+
+ WRITE_ONCE(scx_watchdog_timestamp, jiffies);
+ WRITE_ONCE(scx_watchdog_interval, intv);
+
+ if (intv < ULONG_MAX)
+ mod_delayed_work(system_dfl_wq, &scx_watchdog_work, intv);
+ else
+ cancel_delayed_work_sync(&scx_watchdog_work);
+}
+
+static s32 scx_link_sched(struct scx_sched *sch)
+{
+ scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
+#ifdef CONFIG_EXT_SUB_SCHED
+ struct scx_sched *parent = scx_parent(sch);
+ s32 ret;
+
+ if (parent) {
+ /*
+ * scx_claim_exit() propagates exit_kind transition to
+ * its sub-scheds while holding scx_sched_lock - either
+ * we can see the parent's non-NONE exit_kind or the
+ * parent can shoot us down.
+ */
+ if (atomic_read(&parent->exit_kind) != SCX_EXIT_NONE) {
+ scx_error(sch, "parent disabled");
+ return -ENOENT;
+ }
+
+ ret = rhashtable_lookup_insert_fast(&scx_sched_hash,
+ &sch->hash_node, scx_sched_hash_params);
+ if (ret) {
+ scx_error(sch, "failed to insert into scx_sched_hash (%d)", ret);
+ return ret;
+ }
+
+ list_add_tail(&sch->sibling, &parent->children);
+ }
+#endif /* CONFIG_EXT_SUB_SCHED */
+
+ list_add_tail_rcu(&sch->all, &scx_sched_all);
+ }
+
+ refresh_watchdog();
+ return 0;
+}
+
+static void scx_unlink_sched(struct scx_sched *sch)
+{
+ scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
+#ifdef CONFIG_EXT_SUB_SCHED
+ if (scx_parent(sch)) {
+ rhashtable_remove_fast(&scx_sched_hash, &sch->hash_node,
+ scx_sched_hash_params);
+ list_del_init(&sch->sibling);
+ }
+#endif /* CONFIG_EXT_SUB_SCHED */
+ list_del_rcu(&sch->all);
+ }
+
+ refresh_watchdog();
+}
+
+/*
+ * Called to disable future dumps and wait for in-progress one while disabling
+ * @sch. Once @sch becomes empty during disable, there's no point in dumping it.
+ * This prevents calling dump ops on a dead sch.
+ */
+static void scx_disable_dump(struct scx_sched *sch)
+{
+ guard(raw_spinlock_irqsave)(&scx_dump_lock);
+ sch->dump_disabled = true;
+}
+
+#ifdef CONFIG_EXT_SUB_SCHED
+static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
+
+static void drain_descendants(struct scx_sched *sch)
+{
+ /*
+ * Child scheds that finished the critical part of disabling will take
+ * themselves off @sch->children. Wait for it to drain. As propagation
+ * is recursive, empty @sch->children means that all proper descendant
+ * scheds reached unlinking stage.
+ */
+ wait_event(scx_unlink_waitq, list_empty(&sch->children));
+}
+
+static void scx_fail_parent(struct scx_sched *sch,
+ struct task_struct *failed, s32 fail_code)
+{
+ struct scx_sched *parent = scx_parent(sch);
struct scx_task_iter sti;
struct task_struct *p;
- int kind, cpu;
- kind = atomic_read(&sch->exit_kind);
- while (true) {
- if (kind == SCX_EXIT_DONE) /* already disabled? */
- return;
- WARN_ON_ONCE(kind == SCX_EXIT_NONE);
- if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
+ scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler",
+ fail_code, failed->comm, failed->pid);
+
+ /*
+ * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into
+ * it. This may cause downstream failures on the BPF side but $parent is
+ * dying anyway.
+ */
+ scx_bypass(parent, true);
+
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ if (scx_task_on_sched(parent, p))
+ continue;
+
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ scx_disable_and_exit_task(sch, p);
+ rcu_assign_pointer(p->scx.sched, parent);
+ }
+ }
+ scx_task_iter_stop(&sti);
+}
+
+static void scx_sub_disable(struct scx_sched *sch)
+{
+ struct scx_sched *parent = scx_parent(sch);
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ int ret;
+
+ /*
+ * Guarantee forward progress and wait for descendants to be disabled.
+ * To limit disruptions, $parent is not bypassed. Tasks are fully
+ * prepped and then inserted back into $parent.
+ */
+ scx_bypass(sch, true);
+ drain_descendants(sch);
+
+ /*
+ * Here, every runnable task is guaranteed to make forward progress and
+ * we can safely use blocking synchronization constructs. Actually
+ * disable ops.
+ */
+ mutex_lock(&scx_enable_mutex);
+ percpu_down_write(&scx_fork_rwsem);
+ scx_cgroup_lock();
+
+ set_cgroup_sched(sch_cgroup(sch), parent);
+
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ struct rq *rq;
+ struct rq_flags rf;
+
+ /* filter out duplicate visits */
+ if (scx_task_on_sched(parent, p))
+ continue;
+
+ /*
+ * By the time control reaches here, all descendant schedulers
+ * should already have been disabled.
+ */
+ WARN_ON_ONCE(!scx_task_on_sched(sch, p));
+
+ /*
+ * If $p is about to be freed, nothing prevents $sch from
+ * unloading before $p reaches sched_ext_free(). Disable and
+ * exit $p right away.
+ */
+ if (!tryget_task_struct(p)) {
+ scx_disable_and_exit_task(sch, p);
+ continue;
+ }
+
+ scx_task_iter_unlock(&sti);
+
+ /*
+ * $p is READY or ENABLED on @sch. Initialize for $parent,
+ * disable and exit from @sch, and then switch over to $parent.
+ *
+ * If a task fails to initialize for $parent, the only available
+ * action is disabling $parent too. While this allows disabling
+ * of a child sched to cause the parent scheduler to fail, the
+ * failure can only originate from ops.init_task() of the
+ * parent. A child can't directly affect the parent through its
+ * own failures.
+ */
+ ret = __scx_init_task(parent, p, false);
+ if (ret) {
+ scx_fail_parent(sch, p, ret);
+ put_task_struct(p);
break;
+ }
+
+ rq = task_rq_lock(p, &rf);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ /*
+ * $p is initialized for $parent and still attached to
+ * @sch. Disable and exit for @sch, switch over to
+ * $parent, override the state to READY to account for
+ * $p having already been initialized, and then enable.
+ */
+ scx_disable_and_exit_task(sch, p);
+ scx_set_task_state(p, SCX_TASK_INIT);
+ rcu_assign_pointer(p->scx.sched, parent);
+ scx_set_task_state(p, SCX_TASK_READY);
+ scx_enable_task(parent, p);
+ }
+ task_rq_unlock(rq, p, &rf);
+
+ put_task_struct(p);
}
- ei->kind = kind;
- ei->reason = scx_exit_reason(ei->kind);
+ scx_task_iter_stop(&sti);
+
+ scx_disable_dump(sch);
- /* guarantee forward progress by bypassing scx_ops */
- scx_bypass(true);
- WRITE_ONCE(scx_aborting, false);
+ scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem);
+
+ /*
+ * All tasks are moved off of @sch but there may still be on-going
+ * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use
+ * the expedited version as ancestors may be waiting in bypass mode.
+ * Also, tell the parent that there is no need to keep running bypass
+ * DSQs for us.
+ */
+ synchronize_rcu_expedited();
+ disable_bypass_dsp(sch);
+
+ scx_unlink_sched(sch);
+
+ mutex_unlock(&scx_enable_mutex);
+
+ /*
+ * @sch is now unlinked from the parent's children list. Notify and call
+ * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called
+ * after unlinking and releasing all locks. See scx_claim_exit().
+ */
+ wake_up_all(&scx_unlink_waitq);
+
+ if (parent->ops.sub_detach && sch->sub_attached) {
+ struct scx_sub_detach_args sub_detach_args = {
+ .ops = &sch->ops,
+ .cgroup_path = sch->cgrp_path,
+ };
+ SCX_CALL_OP(parent, sub_detach, NULL,
+ &sub_detach_args);
+ }
+
+ if (sch->ops.exit)
+ SCX_CALL_OP(sch, exit, NULL, sch->exit_info);
+ kobject_del(&sch->kobj);
+}
+#else /* CONFIG_EXT_SUB_SCHED */
+static void drain_descendants(struct scx_sched *sch) { }
+static void scx_sub_disable(struct scx_sched *sch) { }
+#endif /* CONFIG_EXT_SUB_SCHED */
+
+static void scx_root_disable(struct scx_sched *sch)
+{
+ struct scx_exit_info *ei = sch->exit_info;
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ int cpu;
+
+ /* guarantee forward progress and wait for descendants to be disabled */
+ scx_bypass(sch, true);
+ drain_descendants(sch);
switch (scx_set_enable_state(SCX_DISABLING)) {
case SCX_DISABLING:
@@ -4323,7 +5741,7 @@ static void scx_disable_workfn(struct kthread_work *work)
/*
* Shut down cgroup support before tasks so that the cgroup attach path
- * doesn't race against scx_exit_task().
+ * doesn't race against scx_disable_and_exit_task().
*/
scx_cgroup_lock();
scx_cgroup_exit(sch);
@@ -4337,7 +5755,7 @@ static void scx_disable_workfn(struct kthread_work *work)
scx_init_task_enabled = false;
- scx_task_iter_start(&sti);
+ scx_task_iter_start(&sti, NULL);
while ((p = scx_task_iter_next_locked(&sti))) {
unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *old_class = p->sched_class;
@@ -4352,9 +5770,16 @@ static void scx_disable_workfn(struct kthread_work *work)
p->sched_class = new_class;
}
- scx_exit_task(p);
+ scx_disable_and_exit_task(scx_task_sched(p), p);
}
scx_task_iter_stop(&sti);
+
+ scx_disable_dump(sch);
+
+ scx_cgroup_lock();
+ set_cgroup_sched(sch_cgroup(sch), NULL);
+ scx_cgroup_unlock();
+
percpu_up_write(&scx_fork_rwsem);
/*
@@ -4387,9 +5812,9 @@ static void scx_disable_workfn(struct kthread_work *work)
}
if (sch->ops.exit)
- SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei);
+ SCX_CALL_OP(sch, exit, NULL, ei);
- cancel_delayed_work_sync(&scx_watchdog_work);
+ scx_unlink_sched(sch);
/*
* scx_root clearing must be inside cpus_read_lock(). See
@@ -4406,27 +5831,31 @@ static void scx_disable_workfn(struct kthread_work *work)
*/
kobject_del(&sch->kobj);
- free_percpu(scx_dsp_ctx);
- scx_dsp_ctx = NULL;
- scx_dsp_max_batch = 0;
free_kick_syncs();
- if (scx_bypassed_for_enable) {
- scx_bypassed_for_enable = false;
- scx_bypass(false);
- }
-
mutex_unlock(&scx_enable_mutex);
WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
done:
- scx_bypass(false);
+ scx_bypass(sch, false);
}
+/*
+ * Claim the exit on @sch. The caller must ensure that the helper kthread work
+ * is kicked before the current task can be preempted. Once exit_kind is
+ * claimed, scx_error() can no longer trigger, so if the current task gets
+ * preempted and the BPF scheduler fails to schedule it back, the helper work
+ * will never be kicked and the whole system can wedge.
+ */
static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind)
{
int none = SCX_EXIT_NONE;
+ lockdep_assert_preemption_disabled();
+
+ if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
+ kind = SCX_EXIT_ERROR;
+
if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
return false;
@@ -4435,24 +5864,61 @@ static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind)
* flag to break potential live-lock scenarios, ensuring we can
* successfully reach scx_bypass().
*/
- WRITE_ONCE(scx_aborting, true);
+ WRITE_ONCE(sch->aborting, true);
+
+ /*
+ * Propagate exits to descendants immediately. Each has a dedicated
+ * helper kthread and can run in parallel. While most of disabling is
+ * serialized, running them in separate threads allows parallelizing
+ * ops.exit(), which can take arbitrarily long prolonging bypass mode.
+ *
+ * To guarantee forward progress, this propagation must be in-line so
+ * that ->aborting is synchronously asserted for all sub-scheds. The
+ * propagation is also the interlocking point against sub-sched
+ * attachment. See scx_link_sched().
+ *
+ * This doesn't cause recursions as propagation only takes place for
+ * non-propagation exits.
+ */
+ if (kind != SCX_EXIT_PARENT) {
+ scoped_guard (raw_spinlock_irqsave, &scx_sched_lock) {
+ struct scx_sched *pos;
+ scx_for_each_descendant_pre(pos, sch)
+ scx_disable(pos, SCX_EXIT_PARENT);
+ }
+ }
+
return true;
}
-static void scx_disable(enum scx_exit_kind kind)
+static void scx_disable_workfn(struct kthread_work *work)
{
- struct scx_sched *sch;
-
- if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
- kind = SCX_EXIT_ERROR;
+ struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
+ struct scx_exit_info *ei = sch->exit_info;
+ int kind;
- rcu_read_lock();
- sch = rcu_dereference(scx_root);
- if (sch) {
- scx_claim_exit(sch, kind);
- kthread_queue_work(sch->helper, &sch->disable_work);
+ kind = atomic_read(&sch->exit_kind);
+ while (true) {
+ if (kind == SCX_EXIT_DONE) /* already disabled? */
+ return;
+ WARN_ON_ONCE(kind == SCX_EXIT_NONE);
+ if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
+ break;
}
- rcu_read_unlock();
+ ei->kind = kind;
+ ei->reason = scx_exit_reason(ei->kind);
+
+ if (scx_parent(sch))
+ scx_sub_disable(sch);
+ else
+ scx_root_disable(sch);
+}
+
+static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind)
+{
+ guard(preempt)();
+ if (scx_claim_exit(sch, kind))
+ irq_work_queue(&sch->disable_irq_work);
}
static void dump_newline(struct seq_buf *s)
@@ -4470,14 +5936,14 @@ static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...)
#ifdef CONFIG_TRACEPOINTS
if (trace_sched_ext_dump_enabled()) {
- /* protected by scx_dump_state()::dump_lock */
+ /* protected by scx_dump_lock */
static char line_buf[SCX_EXIT_MSG_LEN];
va_start(args, fmt);
vscnprintf(line_buf, sizeof(line_buf), fmt, args);
va_end(args);
- trace_sched_ext_dump(line_buf);
+ trace_call__sched_ext_dump(line_buf);
}
#endif
/* @s may be zero sized and seq_buf triggers WARN if so */
@@ -4566,25 +6032,38 @@ static void ops_dump_exit(void)
scx_dump_data.cpu = -1;
}
-static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
+static void scx_dump_task(struct scx_sched *sch,
+ struct seq_buf *s, struct scx_dump_ctx *dctx,
struct task_struct *p, char marker)
{
static unsigned long bt[SCX_EXIT_BT_LEN];
- struct scx_sched *sch = scx_root;
+ struct scx_sched *task_sch = scx_task_sched(p);
+ const char *own_marker;
+ char sch_id_buf[32];
char dsq_id_buf[19] = "(n/a)";
unsigned long ops_state = atomic_long_read(&p->scx.ops_state);
unsigned int bt_len = 0;
+ own_marker = task_sch == sch ? "*" : "";
+
+ if (task_sch->level == 0)
+ scnprintf(sch_id_buf, sizeof(sch_id_buf), "root");
+ else
+ scnprintf(sch_id_buf, sizeof(sch_id_buf), "sub%d-%llu",
+ task_sch->level, task_sch->ops.sub_cgroup_id);
+
if (p->scx.dsq)
scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx",
(unsigned long long)p->scx.dsq->id);
dump_newline(s);
- dump_line(s, " %c%c %s[%d] %+ldms",
+ dump_line(s, " %c%c %s[%d] %s%s %+ldms",
marker, task_state_to_char(p), p->comm, p->pid,
+ own_marker, sch_id_buf,
jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies));
dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu",
- scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK,
+ scx_get_task_state(p) >> SCX_TASK_STATE_SHIFT,
+ p->scx.flags & ~SCX_TASK_STATE_MASK,
p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK,
ops_state >> SCX_OPSS_QSEQ_SHIFT);
dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s",
@@ -4596,7 +6075,7 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
if (SCX_HAS_OP(sch, dump_task)) {
ops_dump_init(s, " ");
- SCX_CALL_OP(sch, SCX_KF_REST, dump_task, NULL, dctx, p);
+ SCX_CALL_OP(sch, dump_task, NULL, dctx, p);
ops_dump_exit();
}
@@ -4609,11 +6088,17 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
}
}
-static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
+/*
+ * Dump scheduler state. If @dump_all_tasks is true, dump all tasks regardless
+ * of which scheduler they belong to. If false, only dump tasks owned by @sch.
+ * For SysRq-D dumps, @dump_all_tasks=false since all schedulers are dumped
+ * separately. For error dumps, @dump_all_tasks=true since only the failing
+ * scheduler is dumped.
+ */
+static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
+ size_t dump_len, bool dump_all_tasks)
{
- static DEFINE_SPINLOCK(dump_lock);
static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n";
- struct scx_sched *sch = scx_root;
struct scx_dump_ctx dctx = {
.kind = ei->kind,
.exit_code = ei->exit_code,
@@ -4623,14 +6108,24 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
};
struct seq_buf s;
struct scx_event_stats events;
- unsigned long flags;
char *buf;
int cpu;
- spin_lock_irqsave(&dump_lock, flags);
+ guard(raw_spinlock_irqsave)(&scx_dump_lock);
+
+ if (sch->dump_disabled)
+ return;
seq_buf_init(&s, ei->dump, dump_len);
+#ifdef CONFIG_EXT_SUB_SCHED
+ if (sch->level == 0)
+ dump_line(&s, "%s: root", sch->ops.name);
+ else
+ dump_line(&s, "%s: sub%d-%llu %s",
+ sch->ops.name, sch->level, sch->ops.sub_cgroup_id,
+ sch->cgrp_path);
+#endif
if (ei->kind == SCX_EXIT_NONE) {
dump_line(&s, "Debug dump triggered by %s", ei->reason);
} else {
@@ -4644,7 +6139,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
if (SCX_HAS_OP(sch, dump)) {
ops_dump_init(&s, "");
- SCX_CALL_OP(sch, SCX_KF_UNLOCKED, dump, NULL, &dctx);
+ SCX_CALL_OP(sch, dump, NULL, &dctx);
ops_dump_exit();
}
@@ -4697,11 +6192,14 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
if (!cpumask_empty(rq->scx.cpus_to_wait))
dump_line(&ns, " cpus_to_wait : %*pb",
cpumask_pr_args(rq->scx.cpus_to_wait));
+ if (!cpumask_empty(rq->scx.cpus_to_sync))
+ dump_line(&ns, " cpus_to_sync : %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_sync));
used = seq_buf_used(&ns);
if (SCX_HAS_OP(sch, dump_cpu)) {
ops_dump_init(&ns, " ");
- SCX_CALL_OP(sch, SCX_KF_REST, dump_cpu, NULL,
+ SCX_CALL_OP(sch, dump_cpu, NULL,
&dctx, cpu, idle);
ops_dump_exit();
}
@@ -4723,11 +6221,13 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
seq_buf_set_overflow(&s);
}
- if (rq->curr->sched_class == &ext_sched_class)
- scx_dump_task(&s, &dctx, rq->curr, '*');
+ if (rq->curr->sched_class == &ext_sched_class &&
+ (dump_all_tasks || scx_task_on_sched(sch, rq->curr)))
+ scx_dump_task(sch, &s, &dctx, rq->curr, '*');
list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
- scx_dump_task(&s, &dctx, p, ' ');
+ if (dump_all_tasks || scx_task_on_sched(sch, p))
+ scx_dump_task(sch, &s, &dctx, p, ' ');
next:
rq_unlock_irqrestore(rq, &rf);
}
@@ -4742,25 +6242,27 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING);
scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ scx_dump_event(s, &events, SCX_EV_REENQ_IMMED);
+ scx_dump_event(s, &events, SCX_EV_REENQ_LOCAL_REPEAT);
scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL);
scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION);
scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE);
+ scx_dump_event(s, &events, SCX_EV_INSERT_NOT_OWNED);
+ scx_dump_event(s, &events, SCX_EV_SUB_BYPASS_DISPATCH);
if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
memcpy(ei->dump + dump_len - sizeof(trunc_marker),
trunc_marker, sizeof(trunc_marker));
-
- spin_unlock_irqrestore(&dump_lock, flags);
}
-static void scx_error_irq_workfn(struct irq_work *irq_work)
+static void scx_disable_irq_workfn(struct irq_work *irq_work)
{
- struct scx_sched *sch = container_of(irq_work, struct scx_sched, error_irq_work);
+ struct scx_sched *sch = container_of(irq_work, struct scx_sched, disable_irq_work);
struct scx_exit_info *ei = sch->exit_info;
if (ei->kind >= SCX_EXIT_ERROR)
- scx_dump_state(ei, sch->ops.exit_dump_len);
+ scx_dump_state(sch, ei, sch->ops.exit_dump_len, true);
kthread_queue_work(sch->helper, &sch->disable_work);
}
@@ -4771,6 +6273,8 @@ static bool scx_vexit(struct scx_sched *sch,
{
struct scx_exit_info *ei = sch->exit_info;
+ guard(preempt)();
+
if (!scx_claim_exit(sch, kind))
return false;
@@ -4788,7 +6292,7 @@ static bool scx_vexit(struct scx_sched *sch,
ei->kind = kind;
ei->reason = scx_exit_reason(ei->kind);
- irq_work_queue(&sch->error_irq_work);
+ irq_work_queue(&sch->disable_irq_work);
return true;
}
@@ -4819,14 +6323,47 @@ static int alloc_kick_syncs(void)
return 0;
}
-static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
+static void free_pnode(struct scx_sched_pnode *pnode)
+{
+ if (!pnode)
+ return;
+ exit_dsq(&pnode->global_dsq);
+ kfree(pnode);
+}
+
+static struct scx_sched_pnode *alloc_pnode(struct scx_sched *sch, int node)
+{
+ struct scx_sched_pnode *pnode;
+
+ pnode = kzalloc_node(sizeof(*pnode), GFP_KERNEL, node);
+ if (!pnode)
+ return NULL;
+
+ if (init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch)) {
+ kfree(pnode);
+ return NULL;
+ }
+
+ return pnode;
+}
+
+/*
+ * Allocate and initialize a new scx_sched. @cgrp's reference is always
+ * consumed whether the function succeeds or fails.
+ */
+static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
+ struct cgroup *cgrp,
+ struct scx_sched *parent)
{
struct scx_sched *sch;
- int node, ret;
+ s32 level = parent ? parent->level + 1 : 0;
+ s32 node, cpu, ret, bypass_fail_cpu = nr_cpu_ids;
- sch = kzalloc_obj(*sch);
- if (!sch)
- return ERR_PTR(-ENOMEM);
+ sch = kzalloc_flex(*sch, ancestors, level + 1);
+ if (!sch) {
+ ret = -ENOMEM;
+ goto err_put_cgrp;
+ }
sch->exit_info = alloc_exit_info(ops->exit_dump_len);
if (!sch->exit_info) {
@@ -4838,29 +6375,42 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
if (ret < 0)
goto err_free_ei;
- sch->global_dsqs = kzalloc_objs(sch->global_dsqs[0], nr_node_ids);
- if (!sch->global_dsqs) {
+ sch->pnode = kzalloc_objs(sch->pnode[0], nr_node_ids);
+ if (!sch->pnode) {
ret = -ENOMEM;
goto err_free_hash;
}
for_each_node_state(node, N_POSSIBLE) {
- struct scx_dispatch_q *dsq;
-
- dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node);
- if (!dsq) {
+ sch->pnode[node] = alloc_pnode(sch, node);
+ if (!sch->pnode[node]) {
ret = -ENOMEM;
- goto err_free_gdsqs;
+ goto err_free_pnode;
}
-
- init_dsq(dsq, SCX_DSQ_GLOBAL);
- sch->global_dsqs[node] = dsq;
}
- sch->pcpu = alloc_percpu(struct scx_sched_pcpu);
+ sch->dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH;
+ sch->pcpu = __alloc_percpu(struct_size_t(struct scx_sched_pcpu,
+ dsp_ctx.buf, sch->dsp_max_batch),
+ __alignof__(struct scx_sched_pcpu));
if (!sch->pcpu) {
ret = -ENOMEM;
- goto err_free_gdsqs;
+ goto err_free_pnode;
+ }
+
+ for_each_possible_cpu(cpu) {
+ ret = init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch);
+ if (ret) {
+ bypass_fail_cpu = cpu;
+ goto err_free_pcpu;
+ }
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
+
+ pcpu->sch = sch;
+ INIT_LIST_HEAD(&pcpu->deferred_reenq_local.node);
}
sch->helper = kthread_run_worker(0, "sched_ext_helper");
@@ -4871,33 +6421,98 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
sched_set_fifo(sch->helper->task);
+ if (parent)
+ memcpy(sch->ancestors, parent->ancestors,
+ level * sizeof(parent->ancestors[0]));
+ sch->ancestors[level] = sch;
+ sch->level = level;
+
+ if (ops->timeout_ms)
+ sch->watchdog_timeout = msecs_to_jiffies(ops->timeout_ms);
+ else
+ sch->watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT;
+
+ sch->slice_dfl = SCX_SLICE_DFL;
atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
- init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
+ init_irq_work(&sch->disable_irq_work, scx_disable_irq_workfn);
kthread_init_work(&sch->disable_work, scx_disable_workfn);
+ timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0);
sch->ops = *ops;
- ops->priv = sch;
+ rcu_assign_pointer(ops->priv, sch);
sch->kobj.kset = scx_kset;
- ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
- if (ret < 0)
+
+#ifdef CONFIG_EXT_SUB_SCHED
+ char *buf = kzalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf) {
+ ret = -ENOMEM;
+ goto err_stop_helper;
+ }
+ cgroup_path(cgrp, buf, PATH_MAX);
+ sch->cgrp_path = kstrdup(buf, GFP_KERNEL);
+ kfree(buf);
+ if (!sch->cgrp_path) {
+ ret = -ENOMEM;
goto err_stop_helper;
+ }
+
+ sch->cgrp = cgrp;
+ INIT_LIST_HEAD(&sch->children);
+ INIT_LIST_HEAD(&sch->sibling);
+
+ if (parent)
+ ret = kobject_init_and_add(&sch->kobj, &scx_ktype,
+ &parent->sub_kset->kobj,
+ "sub-%llu", cgroup_id(cgrp));
+ else
+ ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
+ if (ret < 0) {
+ kobject_put(&sch->kobj);
+ return ERR_PTR(ret);
+ }
+
+ if (ops->sub_attach) {
+ sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj);
+ if (!sch->sub_kset) {
+ kobject_put(&sch->kobj);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+#else /* CONFIG_EXT_SUB_SCHED */
+ ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
+ if (ret < 0) {
+ kobject_put(&sch->kobj);
+ return ERR_PTR(ret);
+ }
+#endif /* CONFIG_EXT_SUB_SCHED */
return sch;
+#ifdef CONFIG_EXT_SUB_SCHED
err_stop_helper:
kthread_destroy_worker(sch->helper);
+#endif
err_free_pcpu:
+ for_each_possible_cpu(cpu) {
+ if (cpu == bypass_fail_cpu)
+ break;
+ exit_dsq(bypass_dsq(sch, cpu));
+ }
free_percpu(sch->pcpu);
-err_free_gdsqs:
+err_free_pnode:
for_each_node_state(node, N_POSSIBLE)
- kfree(sch->global_dsqs[node]);
- kfree(sch->global_dsqs);
+ free_pnode(sch->pnode[node]);
+ kfree(sch->pnode);
err_free_hash:
rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
err_free_ei:
free_exit_info(sch->exit_info);
err_free_sch:
kfree(sch);
+err_put_cgrp:
+#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED)
+ cgroup_put(cgrp);
+#endif
return ERR_PTR(ret);
}
@@ -4946,29 +6561,35 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
return -EINVAL;
}
- if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT)
- pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n");
-
if (ops->cpu_acquire || ops->cpu_release)
pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n");
return 0;
}
-static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+/*
+ * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid
+ * starvation. During the READY -> ENABLED task switching loop, the calling
+ * thread's sched_class gets switched from fair to ext. As fair has higher
+ * priority than ext, the calling thread can be indefinitely starved under
+ * fair-class saturation, leading to a system hang.
+ */
+struct scx_enable_cmd {
+ struct kthread_work work;
+ struct sched_ext_ops *ops;
+ int ret;
+};
+
+static void scx_root_enable_workfn(struct kthread_work *work)
{
+ struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work);
+ struct sched_ext_ops *ops = cmd->ops;
+ struct cgroup *cgrp = root_cgroup();
struct scx_sched *sch;
struct scx_task_iter sti;
struct task_struct *p;
- unsigned long timeout;
int i, cpu, ret;
- if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
- cpu_possible_mask)) {
- pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
- return -EINVAL;
- }
-
mutex_lock(&scx_enable_mutex);
if (scx_enable_state() != SCX_DISABLED) {
@@ -4980,7 +6601,10 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (ret)
goto err_unlock;
- sch = scx_alloc_and_add_sched(ops);
+#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED)
+ cgroup_get(cgrp);
+#endif
+ sch = scx_alloc_and_add_sched(ops, cgrp, NULL);
if (IS_ERR(sch)) {
ret = PTR_ERR(sch);
goto err_free_ksyncs;
@@ -4992,13 +6616,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
*/
WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
WARN_ON_ONCE(scx_root);
- if (WARN_ON_ONCE(READ_ONCE(scx_aborting)))
- WRITE_ONCE(scx_aborting, false);
atomic_long_set(&scx_nr_rejected, 0);
- for_each_possible_cpu(cpu)
- cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE;
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ rq->scx.local_dsq.sched = sch;
+ rq->scx.cpuperf_target = SCX_CPUPERF_ONE;
+ }
/*
* Keep CPUs stable during enable so that the BPF scheduler can track
@@ -5012,10 +6638,14 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
*/
rcu_assign_pointer(scx_root, sch);
+ ret = scx_link_sched(sch);
+ if (ret)
+ goto err_disable;
+
scx_idle_enable(ops);
if (sch->ops.init) {
- ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL);
+ ret = SCX_CALL_OP_RET(sch, init, NULL);
if (ret) {
ret = ops_sanitize_err(sch, "init", ret);
cpus_read_unlock();
@@ -5042,34 +6672,13 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (ret)
goto err_disable;
- WARN_ON_ONCE(scx_dsp_ctx);
- scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH;
- scx_dsp_ctx = __alloc_percpu(struct_size_t(struct scx_dsp_ctx, buf,
- scx_dsp_max_batch),
- __alignof__(struct scx_dsp_ctx));
- if (!scx_dsp_ctx) {
- ret = -ENOMEM;
- goto err_disable;
- }
-
- if (ops->timeout_ms)
- timeout = msecs_to_jiffies(ops->timeout_ms);
- else
- timeout = SCX_WATCHDOG_MAX_TIMEOUT;
-
- WRITE_ONCE(scx_watchdog_timeout, timeout);
- WRITE_ONCE(scx_watchdog_timestamp, jiffies);
- queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
- scx_watchdog_timeout / 2);
-
/*
* Once __scx_enabled is set, %current can be switched to SCX anytime.
* This can lead to stalls as some BPF schedulers (e.g. userspace
* scheduling) may not function correctly before all tasks are switched.
* Init in bypass mode to guarantee forward progress.
*/
- scx_bypass(true);
- scx_bypassed_for_enable = true;
+ scx_bypass(sch, true);
for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
if (((void (**)(void))ops)[i])
@@ -5101,11 +6710,12 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
* never sees uninitialized tasks.
*/
scx_cgroup_lock();
+ set_cgroup_sched(sch_cgroup(sch), sch);
ret = scx_cgroup_init(sch);
if (ret)
goto err_disable_unlock_all;
- scx_task_iter_start(&sti);
+ scx_task_iter_start(&sti, NULL);
while ((p = scx_task_iter_next_locked(&sti))) {
/*
* @p may already be dead, have lost all its usages counts and
@@ -5117,7 +6727,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_task_iter_unlock(&sti);
- ret = scx_init_task(p, task_group(p), false);
+ ret = scx_init_task(sch, p, false);
if (ret) {
put_task_struct(p);
scx_task_iter_stop(&sti);
@@ -5126,6 +6736,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
goto err_disable_unlock_all;
}
+ scx_set_task_sched(p, sch);
scx_set_task_state(p, SCX_TASK_READY);
put_task_struct(p);
@@ -5147,7 +6758,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
* scx_tasks_lock.
*/
percpu_down_write(&scx_fork_rwsem);
- scx_task_iter_start(&sti);
+ scx_task_iter_start(&sti, NULL);
while ((p = scx_task_iter_next_locked(&sti))) {
unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
const struct sched_class *old_class = p->sched_class;
@@ -5160,15 +6771,14 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
queue_flags |= DEQUEUE_CLASS;
scoped_guard (sched_change, p, queue_flags) {
- p->scx.slice = READ_ONCE(scx_slice_dfl);
+ p->scx.slice = READ_ONCE(sch->slice_dfl);
p->sched_class = new_class;
}
}
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
- scx_bypassed_for_enable = false;
- scx_bypass(false);
+ scx_bypass(sch, false);
if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE);
@@ -5185,13 +6795,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
atomic_long_inc(&scx_enable_seq);
- return 0;
+ cmd->ret = 0;
+ return;
err_free_ksyncs:
free_kick_syncs();
err_unlock:
mutex_unlock(&scx_enable_mutex);
- return ret;
+ cmd->ret = ret;
+ return;
err_disable_unlock_all:
scx_cgroup_unlock();
@@ -5208,9 +6820,355 @@ err_disable:
* Flush scx_disable_work to ensure that error is reported before init
* completion. sch's base reference will be put by bpf_scx_unreg().
*/
- scx_error(sch, "scx_enable() failed (%d)", ret);
+ scx_error(sch, "scx_root_enable() failed (%d)", ret);
kthread_flush_work(&sch->disable_work);
- return 0;
+ cmd->ret = 0;
+}
+
+#ifdef CONFIG_EXT_SUB_SCHED
+/* verify that a scheduler can be attached to @cgrp and return the parent */
+static struct scx_sched *find_parent_sched(struct cgroup *cgrp)
+{
+ struct scx_sched *parent = cgrp->scx_sched;
+ struct scx_sched *pos;
+
+ lockdep_assert_held(&scx_sched_lock);
+
+ /* can't attach twice to the same cgroup */
+ if (parent->cgrp == cgrp)
+ return ERR_PTR(-EBUSY);
+
+ /* does $parent allow sub-scheds? */
+ if (!parent->ops.sub_attach)
+ return ERR_PTR(-EOPNOTSUPP);
+
+	/* can't insert between $parent and its existing children */
+ list_for_each_entry(pos, &parent->children, sibling)
+ if (cgroup_is_descendant(pos->cgrp, cgrp))
+ return ERR_PTR(-EBUSY);
+
+ return parent;
+}
+
+static bool assert_task_ready_or_enabled(struct task_struct *p)
+{
+ u32 state = scx_get_task_state(p);
+
+ switch (state) {
+ case SCX_TASK_READY:
+ case SCX_TASK_ENABLED:
+ return true;
+ default:
+ WARN_ONCE(true, "sched_ext: Invalid task state %d for %s[%d] during enabling sub sched",
+ state, p->comm, p->pid);
+ return false;
+ }
+}
+
+static void scx_sub_enable_workfn(struct kthread_work *work)
+{
+ struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work);
+ struct sched_ext_ops *ops = cmd->ops;
+ struct cgroup *cgrp;
+ struct scx_sched *parent, *sch;
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ s32 i, ret;
+
+ mutex_lock(&scx_enable_mutex);
+
+ if (!scx_enabled()) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+
+ cgrp = cgroup_get_from_id(ops->sub_cgroup_id);
+ if (IS_ERR(cgrp)) {
+ ret = PTR_ERR(cgrp);
+ goto out_unlock;
+ }
+
+ raw_spin_lock_irq(&scx_sched_lock);
+ parent = find_parent_sched(cgrp);
+ if (IS_ERR(parent)) {
+ raw_spin_unlock_irq(&scx_sched_lock);
+ ret = PTR_ERR(parent);
+ goto out_put_cgrp;
+ }
+ kobject_get(&parent->kobj);
+ raw_spin_unlock_irq(&scx_sched_lock);
+
+ /* scx_alloc_and_add_sched() consumes @cgrp whether it succeeds or not */
+ sch = scx_alloc_and_add_sched(ops, cgrp, parent);
+ kobject_put(&parent->kobj);
+ if (IS_ERR(sch)) {
+ ret = PTR_ERR(sch);
+ goto out_unlock;
+ }
+
+ ret = scx_link_sched(sch);
+ if (ret)
+ goto err_disable;
+
+ if (sch->level >= SCX_SUB_MAX_DEPTH) {
+ scx_error(sch, "max nesting depth %d violated",
+ SCX_SUB_MAX_DEPTH);
+ goto err_disable;
+ }
+
+ if (sch->ops.init) {
+ ret = SCX_CALL_OP_RET(sch, init, NULL);
+ if (ret) {
+ ret = ops_sanitize_err(sch, "init", ret);
+ scx_error(sch, "ops.init() failed (%d)", ret);
+ goto err_disable;
+ }
+ sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
+ }
+
+ if (validate_ops(sch, ops))
+ goto err_disable;
+
+ struct scx_sub_attach_args sub_attach_args = {
+ .ops = &sch->ops,
+ .cgroup_path = sch->cgrp_path,
+ };
+
+ ret = SCX_CALL_OP_RET(parent, sub_attach, NULL,
+ &sub_attach_args);
+ if (ret) {
+ ret = ops_sanitize_err(sch, "sub_attach", ret);
+ scx_error(sch, "parent rejected (%d)", ret);
+ goto err_disable;
+ }
+ sch->sub_attached = true;
+
+ scx_bypass(sch, true);
+
+ for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
+ if (((void (**)(void))ops)[i])
+ set_bit(i, sch->has_op);
+
+ percpu_down_write(&scx_fork_rwsem);
+ scx_cgroup_lock();
+
+ /*
+	 * Set cgroup->scx_sched and check CSS_ONLINE. Either we see
+ * !CSS_ONLINE or scx_cgroup_lifetime_notify() sees and shoots us down.
+ */
+ set_cgroup_sched(sch_cgroup(sch), sch);
+ if (!(cgrp->self.flags & CSS_ONLINE)) {
+ scx_error(sch, "cgroup is not online");
+ goto err_unlock_and_disable;
+ }
+
+ /*
+ * Initialize tasks for the new child $sch without exiting them for
+ * $parent so that the tasks can always be reverted back to $parent
+ * sched on child init failure.
+ */
+ WARN_ON_ONCE(scx_enabling_sub_sched);
+ scx_enabling_sub_sched = sch;
+
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ struct rq *rq;
+ struct rq_flags rf;
+
+ /*
+ * Task iteration may visit the same task twice when racing
+ * against exiting. Use %SCX_TASK_SUB_INIT to mark tasks which
+ * finished __scx_init_task() and skip if set.
+ *
+ * A task may exit and get freed between __scx_init_task()
+ * completion and scx_enable_task(). In such cases,
+ * scx_disable_and_exit_task() must exit the task for both the
+ * parent and child scheds.
+ */
+ if (p->scx.flags & SCX_TASK_SUB_INIT)
+ continue;
+
+ /* see scx_root_enable() */
+ if (!tryget_task_struct(p))
+ continue;
+
+ if (!assert_task_ready_or_enabled(p)) {
+ ret = -EINVAL;
+ goto abort;
+ }
+
+ scx_task_iter_unlock(&sti);
+
+ /*
+ * As $p is still on $parent, it can't be transitioned to INIT.
+ * Let's worry about task state later. Use __scx_init_task().
+ */
+ ret = __scx_init_task(sch, p, false);
+ if (ret)
+ goto abort;
+
+ rq = task_rq_lock(p, &rf);
+ p->scx.flags |= SCX_TASK_SUB_INIT;
+ task_rq_unlock(rq, p, &rf);
+
+ put_task_struct(p);
+ }
+ scx_task_iter_stop(&sti);
+
+ /*
+ * All tasks are prepped. Disable/exit tasks for $parent and enable for
+ * the new @sch.
+ */
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ /*
+ * Use clearing of %SCX_TASK_SUB_INIT to detect and skip
+ * duplicate iterations.
+ */
+ if (!(p->scx.flags & SCX_TASK_SUB_INIT))
+ continue;
+
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ /*
+ * $p must be either READY or ENABLED. If ENABLED,
+			 * __scx_disable_and_exit_task() first disables and
+ * makes it READY. However, after exiting $p, it will
+ * leave $p as READY.
+ */
+ assert_task_ready_or_enabled(p);
+ __scx_disable_and_exit_task(parent, p);
+
+ /*
+ * $p is now only initialized for @sch and READY, which
+ * is what we want. Assign it to @sch and enable.
+ */
+ rcu_assign_pointer(p->scx.sched, sch);
+ scx_enable_task(sch, p);
+
+ p->scx.flags &= ~SCX_TASK_SUB_INIT;
+ }
+ }
+ scx_task_iter_stop(&sti);
+
+ scx_enabling_sub_sched = NULL;
+
+ scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem);
+
+ scx_bypass(sch, false);
+
+ pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name);
+ kobject_uevent(&sch->kobj, KOBJ_ADD);
+ ret = 0;
+ goto out_unlock;
+
+out_put_cgrp:
+ cgroup_put(cgrp);
+out_unlock:
+ mutex_unlock(&scx_enable_mutex);
+ cmd->ret = ret;
+ return;
+
+abort:
+ put_task_struct(p);
+ scx_task_iter_stop(&sti);
+ scx_enabling_sub_sched = NULL;
+
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ if (p->scx.flags & SCX_TASK_SUB_INIT) {
+ __scx_disable_and_exit_task(sch, p);
+ p->scx.flags &= ~SCX_TASK_SUB_INIT;
+ }
+ }
+ scx_task_iter_stop(&sti);
+err_unlock_and_disable:
+ /* we'll soon enter disable path, keep bypass on */
+ scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem);
+err_disable:
+ mutex_unlock(&scx_enable_mutex);
+ kthread_flush_work(&sch->disable_work);
+ cmd->ret = 0;
+}
+
+static s32 scx_cgroup_lifetime_notify(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct cgroup *cgrp = data;
+ struct cgroup *parent = cgroup_parent(cgrp);
+
+ if (!cgroup_on_dfl(cgrp))
+ return NOTIFY_OK;
+
+ switch (action) {
+ case CGROUP_LIFETIME_ONLINE:
+ /* inherit ->scx_sched from $parent */
+ if (parent)
+ rcu_assign_pointer(cgrp->scx_sched, parent->scx_sched);
+ break;
+ case CGROUP_LIFETIME_OFFLINE:
+ /* if there is a sched attached, shoot it down */
+ if (cgrp->scx_sched && cgrp->scx_sched->cgrp == cgrp)
+ scx_exit(cgrp->scx_sched, SCX_EXIT_UNREG_KERN,
+ SCX_ECODE_RSN_CGROUP_OFFLINE,
+ "cgroup %llu going offline", cgroup_id(cgrp));
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block scx_cgroup_lifetime_nb = {
+ .notifier_call = scx_cgroup_lifetime_notify,
+};
+
+static s32 __init scx_cgroup_lifetime_notifier_init(void)
+{
+ return blocking_notifier_chain_register(&cgroup_lifetime_notifier,
+ &scx_cgroup_lifetime_nb);
+}
+core_initcall(scx_cgroup_lifetime_notifier_init);
+#endif /* CONFIG_EXT_SUB_SCHED */
+
+static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+{
+ static struct kthread_worker *helper;
+ static DEFINE_MUTEX(helper_mutex);
+ struct scx_enable_cmd cmd;
+
+ if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
+ cpu_possible_mask)) {
+ pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
+ return -EINVAL;
+ }
+
+ if (!READ_ONCE(helper)) {
+ mutex_lock(&helper_mutex);
+ if (!helper) {
+ struct kthread_worker *w =
+ kthread_run_worker(0, "scx_enable_helper");
+ if (IS_ERR_OR_NULL(w)) {
+ mutex_unlock(&helper_mutex);
+ return -ENOMEM;
+ }
+ sched_set_fifo(w->task);
+ WRITE_ONCE(helper, w);
+ }
+ mutex_unlock(&helper_mutex);
+ }
+
+#ifdef CONFIG_EXT_SUB_SCHED
+ if (ops->sub_cgroup_id > 1)
+ kthread_init_work(&cmd.work, scx_sub_enable_workfn);
+ else
+#endif /* CONFIG_EXT_SUB_SCHED */
+ kthread_init_work(&cmd.work, scx_root_enable_workfn);
+ cmd.ops = ops;
+
+ kthread_queue_work(READ_ONCE(helper), &cmd.work);
+ kthread_flush_work(&cmd.work);
+ return cmd.ret;
}
@@ -5246,12 +7204,17 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log,
t = btf_type_by_id(reg->btf, reg->btf_id);
if (t == task_struct_type) {
- if (off >= offsetof(struct task_struct, scx.slice) &&
- off + size <= offsetofend(struct task_struct, scx.slice))
- return SCALAR_VALUE;
- if (off >= offsetof(struct task_struct, scx.dsq_vtime) &&
- off + size <= offsetofend(struct task_struct, scx.dsq_vtime))
+ /*
+ * COMPAT: Will be removed in v6.23.
+ */
+ if ((off >= offsetof(struct task_struct, scx.slice) &&
+ off + size <= offsetofend(struct task_struct, scx.slice)) ||
+ (off >= offsetof(struct task_struct, scx.dsq_vtime) &&
+ off + size <= offsetofend(struct task_struct, scx.dsq_vtime))) {
+ pr_warn("sched_ext: Writing directly to p->scx.slice/dsq_vtime is deprecated, use scx_bpf_task_set_slice/dsq_vtime()");
return SCALAR_VALUE;
+ }
+
if (off >= offsetof(struct task_struct, scx.disallow) &&
off + size <= offsetofend(struct task_struct, scx.disallow))
return SCALAR_VALUE;
@@ -5307,11 +7270,30 @@ static int bpf_scx_init_member(const struct btf_type *t,
case offsetof(struct sched_ext_ops, hotplug_seq):
ops->hotplug_seq = *(u64 *)(udata + moff);
return 1;
+#ifdef CONFIG_EXT_SUB_SCHED
+ case offsetof(struct sched_ext_ops, sub_cgroup_id):
+ ops->sub_cgroup_id = *(u64 *)(udata + moff);
+ return 1;
+#endif /* CONFIG_EXT_SUB_SCHED */
}
return 0;
}
+#ifdef CONFIG_EXT_SUB_SCHED
+static void scx_pstack_recursion_on_dispatch(struct bpf_prog *prog)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = scx_prog_sched(prog->aux);
+ if (unlikely(!sch))
+ return;
+
+ scx_error(sch, "dispatch recursion detected");
+}
+#endif /* CONFIG_EXT_SUB_SCHED */
+
static int bpf_scx_check_member(const struct btf_type *t,
const struct btf_member *member,
const struct bpf_prog *prog)
@@ -5329,12 +7311,30 @@ static int bpf_scx_check_member(const struct btf_type *t,
case offsetof(struct sched_ext_ops, cpu_offline):
case offsetof(struct sched_ext_ops, init):
case offsetof(struct sched_ext_ops, exit):
+ case offsetof(struct sched_ext_ops, sub_attach):
+ case offsetof(struct sched_ext_ops, sub_detach):
break;
default:
if (prog->sleepable)
return -EINVAL;
}
+#ifdef CONFIG_EXT_SUB_SCHED
+ /*
+ * Enable private stack for operations that can nest along the
+ * hierarchy.
+ *
+ * XXX - Ideally, we should only do this for scheds that allow
+ * sub-scheds and sub-scheds themselves but I don't know how to access
+ * struct_ops from here.
+ */
+ switch (moff) {
+ case offsetof(struct sched_ext_ops, dispatch):
+ prog->aux->priv_stack_requested = true;
+ prog->aux->recursion_detected = scx_pstack_recursion_on_dispatch;
+ }
+#endif /* CONFIG_EXT_SUB_SCHED */
+
return 0;
}
@@ -5346,10 +7346,11 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
{
struct sched_ext_ops *ops = kdata;
- struct scx_sched *sch = ops->priv;
+ struct scx_sched *sch = rcu_dereference_protected(ops->priv, true);
- scx_disable(SCX_EXIT_UNREG);
+ scx_disable(sch, SCX_EXIT_UNREG);
kthread_flush_work(&sch->disable_work);
+ RCU_INIT_POINTER(ops->priv, NULL);
kobject_put(&sch->kobj);
}
@@ -5406,7 +7407,9 @@ static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgro
static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {}
static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {}
-#endif
+#endif /* CONFIG_EXT_GROUP_SCHED */
+static s32 sched_ext_ops__sub_attach(struct scx_sub_attach_args *args) { return -EINVAL; }
+static void sched_ext_ops__sub_detach(struct scx_sub_detach_args *args) {}
static void sched_ext_ops__cpu_online(s32 cpu) {}
static void sched_ext_ops__cpu_offline(s32 cpu) {}
static s32 sched_ext_ops__init(void) { return -EINVAL; }
@@ -5446,6 +7449,8 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
.cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth,
.cgroup_set_idle = sched_ext_ops__cgroup_set_idle,
#endif
+ .sub_attach = sched_ext_ops__sub_attach,
+ .sub_detach = sched_ext_ops__sub_detach,
.cpu_online = sched_ext_ops__cpu_online,
.cpu_offline = sched_ext_ops__cpu_offline,
.init = sched_ext_ops__init,
@@ -5476,7 +7481,15 @@ static struct bpf_struct_ops bpf_sched_ext_ops = {
static void sysrq_handle_sched_ext_reset(u8 key)
{
- scx_disable(SCX_EXIT_SYSRQ);
+ struct scx_sched *sch;
+
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (likely(sch))
+ scx_disable(sch, SCX_EXIT_SYSRQ);
+ else
+ pr_info("sched_ext: BPF schedulers not loaded\n");
+ rcu_read_unlock();
}
static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
@@ -5489,9 +7502,10 @@ static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
static void sysrq_handle_sched_ext_dump(u8 key)
{
struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" };
+ struct scx_sched *sch;
- if (scx_enabled())
- scx_dump_state(&ei, 0);
+ list_for_each_entry_rcu(sch, &scx_sched_all, all)
+ scx_dump_state(sch, &ei, 0, false);
}
static const struct sysrq_key_op sysrq_sched_ext_dump_op = {
@@ -5545,11 +7559,11 @@ static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
if (cur_class == &ext_sched_class) {
+ cpumask_set_cpu(cpu, this_scx->cpus_to_sync);
ksyncs[cpu] = rq->scx.kick_sync;
should_wait = true;
- } else {
- cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
}
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
}
resched_curr(rq);
@@ -5586,10 +7600,9 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
unsigned long *ksyncs;
s32 cpu;
- if (unlikely(!ksyncs_pcpu)) {
- pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_syncs");
+ /* can race with free_kick_syncs() during scheduler disable */
+ if (unlikely(!ksyncs_pcpu))
return;
- }
ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs;
@@ -5604,27 +7617,15 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
}
- if (!should_wait)
- return;
-
- for_each_cpu(cpu, this_scx->cpus_to_wait) {
- unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync;
-
- /*
- * Busy-wait until the task running at the time of kicking is no
- * longer running. This can be used to implement e.g. core
- * scheduling.
- *
- * smp_cond_load_acquire() pairs with store_releases in
- * pick_task_scx() and put_prev_task_scx(). The former breaks
- * the wait if SCX's scheduling path is entered even if the same
- * task is picked subsequently. The latter is necessary to break
- * the wait when $cpu is taken by a higher sched class.
- */
- if (cpu != cpu_of(this_rq))
- smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]);
-
- cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
+ /*
+ * Can't wait in hardirq — kick_sync can't advance, deadlocking if
+ * CPUs wait for each other. Defer to kick_sync_wait_bal_cb().
+ */
+ if (should_wait) {
+ raw_spin_rq_lock(this_rq);
+ this_scx->kick_sync_pending = true;
+ resched_curr(this_rq);
+ raw_spin_rq_unlock(this_rq);
}
}
@@ -5642,14 +7643,18 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
*/
void print_scx_info(const char *log_lvl, struct task_struct *p)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch;
enum scx_enable_state state = scx_enable_state();
const char *all = READ_ONCE(scx_switching_all) ? "+all" : "";
char runnable_at_buf[22] = "?";
struct sched_class *class;
unsigned long runnable_at;
- if (state == SCX_DISABLED)
+ guard(rcu)();
+
+ sch = scx_task_sched_rcu(p);
+
+ if (!sch)
return;
/*
@@ -5676,6 +7681,14 @@ void print_scx_info(const char *log_lvl, struct task_struct *p)
static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (!sch)
+ return NOTIFY_OK;
+
/*
* SCX schedulers often have userspace components which are sometimes
* involved in critial scheduling paths. PM operations involve freezing
@@ -5686,12 +7699,12 @@ static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *
case PM_HIBERNATION_PREPARE:
case PM_SUSPEND_PREPARE:
case PM_RESTORE_PREPARE:
- scx_bypass(true);
+ scx_bypass(sch, true);
break;
case PM_POST_HIBERNATION:
case PM_POST_SUSPEND:
case PM_POST_RESTORE:
- scx_bypass(false);
+ scx_bypass(sch, false);
break;
}
@@ -5720,8 +7733,9 @@ void __init init_sched_ext_class(void)
struct rq *rq = cpu_rq(cpu);
int n = cpu_to_node(cpu);
- init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
- init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS);
+ /* local_dsq's sch will be set during scx_root_enable() */
+ BUG_ON(init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL));
+
INIT_LIST_HEAD(&rq->scx.runnable_list);
INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
@@ -5729,6 +7743,10 @@ void __init init_sched_ext_class(void)
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n));
+ raw_spin_lock_init(&rq->scx.deferred_reenq_lock);
+ INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals);
+ INIT_LIST_HEAD(&rq->scx.deferred_reenq_users);
rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
@@ -5739,18 +7757,36 @@ void __init init_sched_ext_class(void)
register_sysrq_key('S', &sysrq_sched_ext_reset_op);
register_sysrq_key('D', &sysrq_sched_ext_dump_op);
INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn);
+
+#ifdef CONFIG_EXT_SUB_SCHED
+ BUG_ON(rhashtable_init(&scx_sched_hash, &scx_sched_hash_params));
+#endif /* CONFIG_EXT_SUB_SCHED */
}
/********************************************************************************
* Helpers that can be called from the BPF scheduler.
*/
-static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p,
- u64 enq_flags)
+static bool scx_vet_enq_flags(struct scx_sched *sch, u64 dsq_id, u64 *enq_flags)
{
- if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
- return false;
+ bool is_local = dsq_id == SCX_DSQ_LOCAL ||
+ (dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON;
+ if (*enq_flags & SCX_ENQ_IMMED) {
+ if (unlikely(!is_local)) {
+ scx_error(sch, "SCX_ENQ_IMMED on a non-local DSQ 0x%llx", dsq_id);
+ return false;
+ }
+ } else if ((sch->ops.flags & SCX_OPS_ALWAYS_ENQ_IMMED) && is_local) {
+ *enq_flags |= SCX_ENQ_IMMED;
+ }
+
+ return true;
+}
+
+static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p,
+ u64 dsq_id, u64 *enq_flags)
+{
lockdep_assert_irqs_disabled();
if (unlikely(!p)) {
@@ -5758,18 +7794,27 @@ static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p
return false;
}
- if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
- scx_error(sch, "invalid enq_flags 0x%llx", enq_flags);
+ if (unlikely(*enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
+ scx_error(sch, "invalid enq_flags 0x%llx", *enq_flags);
+ return false;
+ }
+
+ /* see SCX_EV_INSERT_NOT_OWNED definition */
+ if (unlikely(!scx_task_on_sched(sch, p))) {
+ __scx_add_event(sch, SCX_EV_INSERT_NOT_OWNED, 1);
return false;
}
+ if (!scx_vet_enq_flags(sch, dsq_id, enq_flags))
+ return false;
+
return true;
}
static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p,
u64 dsq_id, u64 enq_flags)
{
- struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+ struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
struct task_struct *ddsp_task;
ddsp_task = __this_cpu_read(direct_dispatch_task);
@@ -5778,7 +7823,7 @@ static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p,
return;
}
- if (unlikely(dspc->cursor >= scx_dsp_max_batch)) {
+ if (unlikely(dspc->cursor >= sch->dsp_max_batch)) {
scx_error(sch, "dispatch buffer overflow");
return;
}
@@ -5799,6 +7844,7 @@ __bpf_kfunc_start_defs();
* @dsq_id: DSQ to insert into
* @slice: duration @p can run for in nsecs, 0 to keep the current value
* @enq_flags: SCX_ENQ_*
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to
* call this function spuriously. Can be called from ops.enqueue(),
@@ -5833,16 +7879,17 @@ __bpf_kfunc_start_defs();
* to check the return value.
*/
__bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id,
- u64 slice, u64 enq_flags)
+ u64 slice, u64 enq_flags,
+ const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return false;
- if (!scx_dsq_insert_preamble(sch, p, enq_flags))
+ if (!scx_dsq_insert_preamble(sch, p, dsq_id, &enq_flags))
return false;
if (slice)
@@ -5859,15 +7906,16 @@ __bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id,
* COMPAT: Will be removed in v6.23 along with the ___v2 suffix.
*/
__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id,
- u64 slice, u64 enq_flags)
+ u64 slice, u64 enq_flags,
+ const struct bpf_prog_aux *aux)
{
- scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags);
+ scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags, aux);
}
static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p,
u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags)
{
- if (!scx_dsq_insert_preamble(sch, p, enq_flags))
+ if (!scx_dsq_insert_preamble(sch, p, dsq_id, &enq_flags))
return false;
if (slice)
@@ -5898,6 +7946,7 @@ struct scx_bpf_dsq_insert_vtime_args {
* @args->slice: duration @p can run for in nsecs, 0 to keep the current value
* @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
* @args->enq_flags: SCX_ENQ_*
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument
* limit. BPF programs should use scx_bpf_dsq_insert_vtime() which is provided
@@ -5922,13 +7971,14 @@ struct scx_bpf_dsq_insert_vtime_args {
*/
__bpf_kfunc bool
__scx_bpf_dsq_insert_vtime(struct task_struct *p,
- struct scx_bpf_dsq_insert_vtime_args *args)
+ struct scx_bpf_dsq_insert_vtime_args *args,
+ const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return false;
@@ -5950,44 +8000,61 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
if (unlikely(!sch))
return;
+#ifdef CONFIG_EXT_SUB_SCHED
+ /*
+ * Disallow if any sub-scheds are attached. There is no way to tell
+ * which scheduler called us, just error out @p's scheduler.
+ */
+ if (unlikely(!list_empty(&sch->children))) {
+ scx_error(scx_task_sched(p), "__scx_bpf_dsq_insert_vtime() must be used");
+ return;
+ }
+#endif
+
scx_dsq_insert_vtime(sch, p, dsq_id, slice, vtime, enq_flags);
}
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch)
-BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_RCU)
-BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_IMPLICIT_ARGS | KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)
static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
.owner = THIS_MODULE,
.set = &scx_kfunc_ids_enqueue_dispatch,
+ .filter = scx_kfunc_context_filter,
};
static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
struct task_struct *p, u64 dsq_id, u64 enq_flags)
{
- struct scx_sched *sch = scx_root;
struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
+ struct scx_sched *sch = src_dsq->sched;
struct rq *this_rq, *src_rq, *locked_rq;
bool dispatched = false;
bool in_balance;
unsigned long flags;
- if (!scx_kf_allowed_if_unlocked() &&
- !scx_kf_allowed(sch, SCX_KF_DISPATCH))
+ if (!scx_vet_enq_flags(sch, dsq_id, &enq_flags))
return false;
/*
* If the BPF scheduler keeps calling this function repeatedly, it can
* cause similar live-lock conditions as consume_dispatch_q().
*/
- if (unlikely(READ_ONCE(scx_aborting)))
+ if (unlikely(READ_ONCE(sch->aborting)))
return false;
+ if (unlikely(!scx_task_on_sched(sch, p))) {
+ scx_error(sch, "scx_bpf_dsq_move[_vtime]() on %s[%d] but the task belongs to a different scheduler",
+ p->comm, p->pid);
+ return false;
+ }
+
/*
* Can be called from either ops.dispatch() locking this_rq() or any
* context where no rq lock is held. If latter, lock @p's task_rq which
@@ -6011,20 +8078,14 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
locked_rq = src_rq;
raw_spin_lock(&src_dsq->lock);
- /*
- * Did someone else get to it? @p could have already left $src_dsq, got
- * re-enqueud, or be in the process of being consumed by someone else.
- */
- if (unlikely(p->scx.dsq != src_dsq ||
- u32_before(kit->cursor.priv, p->scx.dsq_seq) ||
- p->scx.holding_cpu >= 0) ||
- WARN_ON_ONCE(src_rq != task_rq(p))) {
+ /* did someone else get to it while we dropped the locks? */
+ if (nldsq_cursor_lost_task(&kit->cursor, src_rq, src_dsq, p)) {
raw_spin_unlock(&src_dsq->lock);
goto out;
}
/* @p is still on $src_dsq and stable, determine the destination */
- dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, p);
+ dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, task_cpu(p));
/*
* Apply vtime and slice updates before moving so that the new time is
@@ -6058,44 +8119,42 @@ __bpf_kfunc_start_defs();
/**
* scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Can only be called from ops.dispatch().
*/
-__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
+__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return 0;
- if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
- return 0;
-
- return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor);
+ return sch->dsp_max_batch - __this_cpu_read(sch->pcpu->dsp_ctx.cursor);
}
/**
* scx_bpf_dispatch_cancel - Cancel the latest dispatch
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Cancel the latest dispatch. Can be called multiple times to cancel further
* dispatches. Can only be called from ops.dispatch().
*/
-__bpf_kfunc void scx_bpf_dispatch_cancel(void)
+__bpf_kfunc void scx_bpf_dispatch_cancel(const struct bpf_prog_aux *aux)
{
- struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
struct scx_sched *sch;
+ struct scx_dsp_ctx *dspc;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return;
- if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
- return;
+ dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
if (dspc->cursor > 0)
dspc->cursor--;
@@ -6105,10 +8164,21 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
/**
* scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ
- * @dsq_id: DSQ to move task from
+ * @dsq_id: DSQ to move task from. Must be a user-created DSQ
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ * @enq_flags: %SCX_ENQ_*
*
* Move a task from the non-local DSQ identified by @dsq_id to the current CPU's
- * local DSQ for execution. Can only be called from ops.dispatch().
+ * local DSQ for execution with @enq_flags applied. Can only be called from
+ * ops.dispatch().
+ *
+ * Built-in DSQs (%SCX_DSQ_GLOBAL and %SCX_DSQ_LOCAL*) are not supported as
+ * sources. Local DSQs support reenqueueing (a task can be picked up for
+ * execution, dequeued for property changes, or reenqueued), but the BPF
+ * scheduler cannot directly iterate or move tasks from them. %SCX_DSQ_GLOBAL
+ * is similar but also doesn't support reenqueueing, as it maps to multiple
+ * per-node DSQs making the scope difficult to define; this may change in the
+ * future.
*
* This function flushes the in-flight dispatches from scx_bpf_dsq_insert()
* before trying to move from the specified DSQ. It may also grab rq locks and
@@ -6117,21 +8187,24 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
* Returns %true if a task has been moved, %false if there isn't any task to
* move.
*/
-__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
+__bpf_kfunc bool scx_bpf_dsq_move_to_local___v2(u64 dsq_id, u64 enq_flags,
+ const struct bpf_prog_aux *aux)
{
- struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
struct scx_dispatch_q *dsq;
struct scx_sched *sch;
+ struct scx_dsp_ctx *dspc;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return false;
- if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
+ if (!scx_vet_enq_flags(sch, SCX_DSQ_LOCAL, &enq_flags))
return false;
+ dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
+
flush_dispatch_buf(sch, dspc->rq);
dsq = find_user_dsq(sch, dsq_id);
@@ -6140,7 +8213,7 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
return false;
}
- if (consume_dispatch_q(sch, dspc->rq, dsq)) {
+ if (consume_dispatch_q(sch, dspc->rq, dsq, enq_flags)) {
/*
* A successfully consumed task can be dequeued before it starts
* running while the CPU is trying to migrate other dispatched
@@ -6154,6 +8227,14 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
}
}
+/*
+ * COMPAT: ___v2 was introduced in v7.1. Remove this and the ___v2 tag in the future.
+ */
+__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id, const struct bpf_prog_aux *aux)
+{
+ return scx_bpf_dsq_move_to_local___v2(dsq_id, 0, aux);
+}
+
/**
* scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs
* @it__iter: DSQ iterator in progress
@@ -6249,105 +8330,104 @@ __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
}
+#ifdef CONFIG_EXT_SUB_SCHED
+/**
+ * scx_bpf_sub_dispatch - Trigger dispatching on a child scheduler
+ * @cgroup_id: cgroup ID of the child scheduler to dispatch
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * Allows a parent scheduler to trigger dispatching on one of its direct
+ * child schedulers. The child scheduler runs its dispatch operation to
+ * move tasks from dispatch queues to the local runqueue.
+ *
+ * Returns: true on success, false if cgroup_id is invalid, not a direct
+ * child, or caller lacks dispatch permission.
+ */
+__bpf_kfunc bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *aux)
+{
+ struct rq *this_rq = this_rq();
+ struct scx_sched *parent, *child;
+
+ guard(rcu)();
+ parent = scx_prog_sched(aux);
+ if (unlikely(!parent))
+ return false;
+
+ child = scx_find_sub_sched(cgroup_id);
+
+ if (unlikely(!child))
+ return false;
+
+ if (unlikely(scx_parent(child) != parent)) {
+ scx_error(parent, "trying to dispatch a distant sub-sched on cgroup %llu",
+ cgroup_id);
+ return false;
+ }
+
+ return scx_dispatch_sched(child, this_rq, this_rq->scx.sub_dispatch_prev,
+ true);
+}
+#endif /* CONFIG_EXT_SUB_SCHED */
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
-BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local___v2, KF_IMPLICIT_ARGS)
+/* scx_bpf_dsq_move*() also in scx_kfunc_ids_unlocked: callable from unlocked contexts */
BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
+#ifdef CONFIG_EXT_SUB_SCHED
+BTF_ID_FLAGS(func, scx_bpf_sub_dispatch, KF_IMPLICIT_ARGS)
+#endif
BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
.owner = THIS_MODULE,
.set = &scx_kfunc_ids_dispatch,
+ .filter = scx_kfunc_context_filter,
};
-static u32 reenq_local(struct rq *rq)
-{
- LIST_HEAD(tasks);
- u32 nr_enqueued = 0;
- struct task_struct *p, *n;
-
- lockdep_assert_rq_held(rq);
-
- /*
- * The BPF scheduler may choose to dispatch tasks back to
- * @rq->scx.local_dsq. Move all candidate tasks off to a private list
- * first to avoid processing the same tasks repeatedly.
- */
- list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
- scx.dsq_list.node) {
- /*
- * If @p is being migrated, @p's current CPU may not agree with
- * its allowed CPUs and the migration_cpu_stop is about to
- * deactivate and re-activate @p anyway. Skip re-enqueueing.
- *
- * While racing sched property changes may also dequeue and
- * re-enqueue a migrating task while its current CPU and allowed
- * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
- * the current local DSQ for running tasks and thus are not
- * visible to the BPF scheduler.
- */
- if (p->migration_pending)
- continue;
-
- dispatch_dequeue(rq, p);
- list_add_tail(&p->scx.dsq_list.node, &tasks);
- }
-
- list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) {
- list_del_init(&p->scx.dsq_list.node);
- do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
- nr_enqueued++;
- }
-
- return nr_enqueued;
-}
-
__bpf_kfunc_start_defs();
/**
* scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Iterate over all of the tasks currently enqueued on the local DSQ of the
* caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of
* processed tasks. Can only be called from ops.cpu_release().
- *
- * COMPAT: Will be removed in v6.23 along with the ___v2 suffix on the void
- * returning variant that can be called from anywhere.
*/
-__bpf_kfunc u32 scx_bpf_reenqueue_local(void)
+__bpf_kfunc u32 scx_bpf_reenqueue_local(const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
struct rq *rq;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return 0;
- if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE))
- return 0;
-
rq = cpu_rq(smp_processor_id());
lockdep_assert_rq_held(rq);
- return reenq_local(rq);
+ return reenq_local(sch, rq, SCX_REENQ_ANY);
}
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_cpu_release)
-BTF_ID_FLAGS(func, scx_bpf_reenqueue_local)
+BTF_ID_FLAGS(func, scx_bpf_reenqueue_local, KF_IMPLICIT_ARGS)
BTF_KFUNCS_END(scx_kfunc_ids_cpu_release)
static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = {
.owner = THIS_MODULE,
.set = &scx_kfunc_ids_cpu_release,
+ .filter = scx_kfunc_context_filter,
};
__bpf_kfunc_start_defs();
@@ -6356,11 +8436,12 @@ __bpf_kfunc_start_defs();
* scx_bpf_create_dsq - Create a custom DSQ
* @dsq_id: DSQ to create
* @node: NUMA node to allocate from
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Create a custom DSQ identified by @dsq_id. Can be called from any sleepable
* scx callback, and any BPF_PROG_TYPE_SYSCALL prog.
*/
-__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
+__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node, const struct bpf_prog_aux *aux)
{
struct scx_dispatch_q *dsq;
struct scx_sched *sch;
@@ -6377,36 +8458,54 @@ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
if (!dsq)
return -ENOMEM;
- init_dsq(dsq, dsq_id);
+ /*
+ * init_dsq() must be called in GFP_KERNEL context. Init it with NULL
+ * @sch and update afterwards.
+ */
+ ret = init_dsq(dsq, dsq_id, NULL);
+ if (ret) {
+ kfree(dsq);
+ return ret;
+ }
rcu_read_lock();
- sch = rcu_dereference(scx_root);
- if (sch)
+ sch = scx_prog_sched(aux);
+ if (sch) {
+ dsq->sched = sch;
ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node,
dsq_hash_params);
- else
+ } else {
ret = -ENODEV;
+ }
rcu_read_unlock();
- if (ret)
+ if (ret) {
+ exit_dsq(dsq);
kfree(dsq);
+ }
return ret;
}
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_unlocked)
-BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_IMPLICIT_ARGS | KF_SLEEPABLE)
+/* also in scx_kfunc_ids_dispatch: also callable from ops.dispatch() */
BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
+/* also in scx_kfunc_ids_select_cpu: also callable from ops.select_cpu()/ops.enqueue() */
+BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
.owner = THIS_MODULE,
.set = &scx_kfunc_ids_unlocked,
+ .filter = scx_kfunc_context_filter,
};
__bpf_kfunc_start_defs();
@@ -6415,12 +8514,21 @@ __bpf_kfunc_start_defs();
* scx_bpf_task_set_slice - Set task's time slice
* @p: task of interest
* @slice: time slice to set in nsecs
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Set @p's time slice to @slice. Returns %true on success, %false if the
* calling scheduler doesn't have authority over @p.
*/
-__bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice)
+__bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice,
+ const struct bpf_prog_aux *aux)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = scx_prog_sched(aux);
+ if (unlikely(!scx_task_on_sched(sch, p)))
+ return false;
+
p->scx.slice = slice;
return true;
}
@@ -6429,12 +8537,21 @@ __bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice)
* scx_bpf_task_set_dsq_vtime - Set task's virtual time for DSQ ordering
* @p: task of interest
* @vtime: virtual time to set
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Set @p's virtual time to @vtime. Returns %true on success, %false if the
* calling scheduler doesn't have authority over @p.
*/
-__bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime)
+__bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime,
+ const struct bpf_prog_aux *aux)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = scx_prog_sched(aux);
+ if (unlikely(!scx_task_on_sched(sch, p)))
+ return false;
+
p->scx.dsq_vtime = vtime;
return true;
}
@@ -6456,7 +8573,7 @@ static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags)
* lead to irq_work_queue() malfunction such as infinite busy wait for
* IRQ status update. Suppress kicking.
*/
- if (scx_rq_bypassing(this_rq))
+ if (scx_bypassing(sch, cpu_of(this_rq)))
goto out;
/*
@@ -6496,18 +8613,19 @@ out:
* scx_bpf_kick_cpu - Trigger reschedule on a CPU
* @cpu: cpu to kick
* @flags: %SCX_KICK_* flags
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Kick @cpu into rescheduling. This can be used to wake up an idle CPU or
* trigger rescheduling on a busy CPU. This can be called from any online
* scx_ops operation and the actual kicking is performed asynchronously through
* an irq work.
*/
-__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
+__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (likely(sch))
scx_kick_cpu(sch, cpu, flags);
}
@@ -6581,13 +8699,14 @@ __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id)
* @it: iterator to initialize
* @dsq_id: DSQ to iterate
* @flags: %SCX_DSQ_ITER_*
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Initialize BPF iterator @it which can be used with bpf_for_each() to walk
* tasks in the DSQ specified by @dsq_id. Iteration using @it only includes
* tasks which are already queued when this function is invoked.
*/
__bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
- u64 flags)
+ u64 flags, const struct bpf_prog_aux *aux)
{
struct bpf_iter_scx_dsq_kern *kit = (void *)it;
struct scx_sched *sch;
@@ -6605,7 +8724,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
*/
kit->dsq = NULL;
- sch = rcu_dereference_check(scx_root, rcu_read_lock_bh_held());
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return -ENODEV;
@@ -6616,8 +8735,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
if (!kit->dsq)
return -ENOENT;
- kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, flags,
- READ_ONCE(kit->dsq->seq));
+ kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, kit->dsq, flags);
return 0;
}
@@ -6631,41 +8749,13 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
__bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it)
{
struct bpf_iter_scx_dsq_kern *kit = (void *)it;
- bool rev = kit->cursor.flags & SCX_DSQ_ITER_REV;
- struct task_struct *p;
- unsigned long flags;
if (!kit->dsq)
return NULL;
- raw_spin_lock_irqsave(&kit->dsq->lock, flags);
-
- if (list_empty(&kit->cursor.node))
- p = NULL;
- else
- p = container_of(&kit->cursor, struct task_struct, scx.dsq_list);
+ guard(raw_spinlock_irqsave)(&kit->dsq->lock);
- /*
- * Only tasks which were queued before the iteration started are
- * visible. This bounds BPF iterations and guarantees that vtime never
- * jumps in the other direction while iterating.
- */
- do {
- p = nldsq_next_task(kit->dsq, p, rev);
- } while (p && unlikely(u32_before(kit->cursor.priv, p->scx.dsq_seq)));
-
- if (p) {
- if (rev)
- list_move_tail(&kit->cursor.node, &p->scx.dsq_list.node);
- else
- list_move(&kit->cursor.node, &p->scx.dsq_list.node);
- } else {
- list_del_init(&kit->cursor.node);
- }
-
- raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
-
- return p;
+ return nldsq_cursor_next_task(&kit->cursor, kit->dsq);
}
/**
@@ -6694,6 +8784,7 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
/**
* scx_bpf_dsq_peek - Lockless peek at the first element.
* @dsq_id: DSQ to examine.
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Read the first element in the DSQ. This is semantically equivalent to using
* the DSQ iterator, but is lockfree. Of course, like any lockless operation,
@@ -6702,12 +8793,13 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
*
* Returns the pointer, or NULL indicates an empty queue OR internal error.
*/
-__bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id)
+__bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id,
+ const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
struct scx_dispatch_q *dsq;
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return NULL;
@@ -6725,6 +8817,62 @@ __bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id)
return rcu_dereference(dsq->first_task);
}
+/**
+ * scx_bpf_dsq_reenq - Re-enqueue tasks on a DSQ
+ * @dsq_id: DSQ to re-enqueue
+ * @reenq_flags: %SCX_RENQ_*
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * Iterate over all of the tasks currently enqueued on the DSQ identified by
+ * @dsq_id, and re-enqueue them in the BPF scheduler. The following DSQs are
+ * supported:
+ *
+ * - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON | $cpu)
+ * - User DSQs
+ *
+ * Re-enqueues are performed asynchronously. Can be called from anywhere.
+ */
+__bpf_kfunc void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags,
+ const struct bpf_prog_aux *aux)
+{
+ struct scx_sched *sch;
+ struct scx_dispatch_q *dsq;
+
+ guard(preempt)();
+
+ sch = scx_prog_sched(aux);
+ if (unlikely(!sch))
+ return;
+
+ if (unlikely(reenq_flags & ~__SCX_REENQ_USER_MASK)) {
+ scx_error(sch, "invalid SCX_REENQ flags 0x%llx", reenq_flags);
+ return;
+ }
+
+ /* not specifying any filter bits is the same as %SCX_REENQ_ANY */
+ if (!(reenq_flags & __SCX_REENQ_FILTER_MASK))
+ reenq_flags |= SCX_REENQ_ANY;
+
+ dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, smp_processor_id());
+ schedule_dsq_reenq(sch, dsq, reenq_flags, scx_locked_rq());
+}
+
+/**
+ * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * Iterate over all of the tasks currently enqueued on the local DSQ of the
+ * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from
+ * anywhere.
+ *
+ * This is now a special case of scx_bpf_dsq_reenq() and may be removed in the
+ * future.
+ */
+__bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
+{
+ scx_bpf_dsq_reenq(SCX_DSQ_LOCAL, 0, aux);
+}
+
__bpf_kfunc_end_defs();
static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf,
@@ -6779,18 +8927,20 @@ __bpf_kfunc_start_defs();
* @fmt: error message format string
* @data: format string parameters packaged using ___bpf_fill() macro
* @data__sz: @data len, must end in '__sz' for the verifier
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Indicate that the BPF scheduler wants to exit gracefully, and initiate ops
* disabling.
*/
__bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
- unsigned long long *data, u32 data__sz)
+ unsigned long long *data, u32 data__sz,
+ const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
unsigned long flags;
raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
- sch = rcu_dereference_bh(scx_root);
+ sch = scx_prog_sched(aux);
if (likely(sch) &&
bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line);
@@ -6802,18 +8952,19 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
* @fmt: error message format string
* @data: format string parameters packaged using ___bpf_fill() macro
* @data__sz: @data len, must end in '__sz' for the verifier
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Indicate that the BPF scheduler encountered a fatal error and initiate ops
* disabling.
*/
__bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
- u32 data__sz)
+ u32 data__sz, const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
unsigned long flags;
raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
- sch = rcu_dereference_bh(scx_root);
+ sch = scx_prog_sched(aux);
if (likely(sch) &&
bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line);
@@ -6825,6 +8976,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
* @fmt: format string
* @data: format string parameters packaged using ___bpf_fill() macro
* @data__sz: @data len, must end in '__sz' for the verifier
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and
* dump_task() to generate extra debug dump specific to the BPF scheduler.
@@ -6833,7 +8985,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
* multiple calls. The last line is automatically terminated.
*/
__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
- u32 data__sz)
+ u32 data__sz, const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
struct scx_dump_data *dd = &scx_dump_data;
@@ -6842,7 +8994,7 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return;
@@ -6879,38 +9031,21 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
}
/**
- * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
- *
- * Iterate over all of the tasks currently enqueued on the local DSQ of the
- * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from
- * anywhere.
- */
-__bpf_kfunc void scx_bpf_reenqueue_local___v2(void)
-{
- struct rq *rq;
-
- guard(preempt)();
-
- rq = this_rq();
- local_set(&rq->scx.reenq_local_deferred, 1);
- schedule_deferred(rq);
-}
-
-/**
* scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU
* @cpu: CPU of interest
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Return the maximum relative capacity of @cpu in relation to the most
* performant CPU in the system. The return value is in the range [1,
* %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur().
*/
-__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
+__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu, const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
return arch_scale_cpu_capacity(cpu);
else
@@ -6920,6 +9055,7 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
/**
* scx_bpf_cpuperf_cur - Query the current relative performance of a CPU
* @cpu: CPU of interest
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Return the current relative performance of @cpu in relation to its maximum.
* The return value is in the range [1, %SCX_CPUPERF_ONE].
@@ -6931,13 +9067,13 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
*
* The result is in the range [1, %SCX_CPUPERF_ONE].
*/
-__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
+__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu, const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
return arch_scale_freq_capacity(cpu);
else
@@ -6948,6 +9084,7 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
* scx_bpf_cpuperf_set - Set the relative performance target of a CPU
* @cpu: CPU of interest
* @perf: target performance level [0, %SCX_CPUPERF_ONE]
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Set the target performance level of @cpu to @perf. @perf is in linear
* relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
@@ -6958,13 +9095,13 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
* use. Consult hardware and cpufreq documentation for more information. The
* current performance level can be monitored using scx_bpf_cpuperf_cur().
*/
-__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
+__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf, const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return;
@@ -7074,14 +9211,15 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
/**
* scx_bpf_cpu_rq - Fetch the rq of a CPU
* @cpu: CPU of the rq
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*/
-__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
+__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu, const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return NULL;
@@ -7100,18 +9238,19 @@ __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
/**
* scx_bpf_locked_rq - Return the rq currently locked by SCX
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Returns the rq if a rq lock is currently held by SCX.
* Otherwise emits an error and returns NULL.
*/
-__bpf_kfunc struct rq *scx_bpf_locked_rq(void)
+__bpf_kfunc struct rq *scx_bpf_locked_rq(const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
struct rq *rq;
guard(preempt)();
- sch = rcu_dereference_sched(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return NULL;
@@ -7127,16 +9266,17 @@ __bpf_kfunc struct rq *scx_bpf_locked_rq(void)
/**
* scx_bpf_cpu_curr - Return remote CPU's curr task
* @cpu: CPU of interest
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Callers must hold RCU read lock (KF_RCU).
*/
-__bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu)
+__bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu, const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return NULL;
@@ -7147,41 +9287,6 @@ __bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu)
}
/**
- * scx_bpf_task_cgroup - Return the sched cgroup of a task
- * @p: task of interest
- *
- * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with
- * from the scheduler's POV. SCX operations should use this function to
- * determine @p's current cgroup as, unlike following @p->cgroups,
- * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all
- * rq-locked operations. Can be called on the parameter tasks of rq-locked
- * operations. The restriction guarantees that @p's rq is locked by the caller.
- */
-#ifdef CONFIG_CGROUP_SCHED
-__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p)
-{
- struct task_group *tg = p->sched_task_group;
- struct cgroup *cgrp = &cgrp_dfl_root.cgrp;
- struct scx_sched *sch;
-
- guard(rcu)();
-
- sch = rcu_dereference(scx_root);
- if (unlikely(!sch))
- goto out;
-
- if (!scx_kf_allowed_on_arg_tasks(sch, __SCX_KF_RQ_LOCKED, p))
- goto out;
-
- cgrp = tg_cgrp(tg);
-
-out:
- cgroup_get(cgrp);
- return cgrp;
-}
-#endif
-
-/**
* scx_bpf_now - Returns a high-performance monotonically non-decreasing
* clock for the current CPU. The clock returned is in nanoseconds.
*
@@ -7257,10 +9362,14 @@ static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *event
scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ scx_agg_event(events, e_cpu, SCX_EV_REENQ_IMMED);
+ scx_agg_event(events, e_cpu, SCX_EV_REENQ_LOCAL_REPEAT);
scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL);
scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION);
scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH);
scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE);
+ scx_agg_event(events, e_cpu, SCX_EV_INSERT_NOT_OWNED);
+ scx_agg_event(events, e_cpu, SCX_EV_SUB_BYPASS_DISPATCH);
}
}
@@ -7294,25 +9403,62 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
memcpy(events, &e_sys, events__sz);
}
+#ifdef CONFIG_CGROUP_SCHED
+/**
+ * scx_bpf_task_cgroup - Return the sched cgroup of a task
+ * @p: task of interest
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with
+ * from the scheduler's POV. SCX operations should use this function to
+ * determine @p's current cgroup as, unlike following @p->cgroups,
+ * @p->sched_task_group is stable for the duration of the SCX op. See
+ * SCX_CALL_OP_TASK() for details.
+ */
+__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p,
+ const struct bpf_prog_aux *aux)
+{
+ struct task_group *tg = p->sched_task_group;
+ struct cgroup *cgrp = &cgrp_dfl_root.cgrp;
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = scx_prog_sched(aux);
+ if (unlikely(!sch))
+ goto out;
+
+ if (!scx_kf_arg_task_ok(sch, p))
+ goto out;
+
+ cgrp = tg_cgrp(tg);
+
+out:
+ cgroup_get(cgrp);
+ return cgrp;
+}
+#endif /* CONFIG_CGROUP_SCHED */
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_any)
-BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_RCU);
-BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_RCU);
-BTF_ID_FLAGS(func, scx_bpf_kick_cpu)
+BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU);
+BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU);
+BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
-BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL)
+BTF_ID_FLAGS(func, scx_bpf_dsq_reenq, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_IMPLICIT_ARGS | KF_ITER_NEW | KF_RCU_PROTECTED)
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
-BTF_ID_FLAGS(func, scx_bpf_exit_bstr)
-BTF_ID_FLAGS(func, scx_bpf_error_bstr)
-BTF_ID_FLAGS(func, scx_bpf_dump_bstr)
-BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2)
-BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
-BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
-BTF_ID_FLAGS(func, scx_bpf_cpuperf_set)
+BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_nr_node_ids)
BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
@@ -7320,14 +9466,14 @@ BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
-BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_RET_NULL)
-BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED)
-#ifdef CONFIG_CGROUP_SCHED
-BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
-#endif
+BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_IMPLICIT_ARGS | KF_RET_NULL)
+BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED)
BTF_ID_FLAGS(func, scx_bpf_now)
BTF_ID_FLAGS(func, scx_bpf_events)
+#ifdef CONFIG_CGROUP_SCHED
+BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_IMPLICIT_ARGS | KF_RCU | KF_ACQUIRE)
+#endif
BTF_KFUNCS_END(scx_kfunc_ids_any)
static const struct btf_kfunc_id_set scx_kfunc_set_any = {
@@ -7335,6 +9481,115 @@ static const struct btf_kfunc_id_set scx_kfunc_set_any = {
.set = &scx_kfunc_ids_any,
};
+/*
+ * Per-op kfunc allow flags. Each bit corresponds to a context-sensitive kfunc
+ * group; an op may permit zero or more groups, with the union expressed in
+ * scx_kf_allow_flags[]. The verifier-time filter (scx_kfunc_context_filter())
+ * consults this table to decide whether a context-sensitive kfunc is callable
+ * from a given SCX op.
+ */
+enum scx_kf_allow_flags {
+ SCX_KF_ALLOW_UNLOCKED = 1 << 0,
+ SCX_KF_ALLOW_CPU_RELEASE = 1 << 1,
+ SCX_KF_ALLOW_DISPATCH = 1 << 2,
+ SCX_KF_ALLOW_ENQUEUE = 1 << 3,
+ SCX_KF_ALLOW_SELECT_CPU = 1 << 4,
+};
+
+/*
+ * Map each SCX op to the union of kfunc groups it permits, indexed by
+ * SCX_OP_IDX(op). Ops not listed only permit kfuncs that are not
+ * context-sensitive.
+ */
+static const u32 scx_kf_allow_flags[] = {
+ [SCX_OP_IDX(select_cpu)] = SCX_KF_ALLOW_SELECT_CPU | SCX_KF_ALLOW_ENQUEUE,
+ [SCX_OP_IDX(enqueue)] = SCX_KF_ALLOW_SELECT_CPU | SCX_KF_ALLOW_ENQUEUE,
+ [SCX_OP_IDX(dispatch)] = SCX_KF_ALLOW_ENQUEUE | SCX_KF_ALLOW_DISPATCH,
+ [SCX_OP_IDX(cpu_release)] = SCX_KF_ALLOW_CPU_RELEASE,
+ [SCX_OP_IDX(init_task)] = SCX_KF_ALLOW_UNLOCKED,
+ [SCX_OP_IDX(dump)] = SCX_KF_ALLOW_UNLOCKED,
+#ifdef CONFIG_EXT_GROUP_SCHED
+ [SCX_OP_IDX(cgroup_init)] = SCX_KF_ALLOW_UNLOCKED,
+ [SCX_OP_IDX(cgroup_exit)] = SCX_KF_ALLOW_UNLOCKED,
+ [SCX_OP_IDX(cgroup_prep_move)] = SCX_KF_ALLOW_UNLOCKED,
+ [SCX_OP_IDX(cgroup_cancel_move)] = SCX_KF_ALLOW_UNLOCKED,
+ [SCX_OP_IDX(cgroup_set_weight)] = SCX_KF_ALLOW_UNLOCKED,
+ [SCX_OP_IDX(cgroup_set_bandwidth)] = SCX_KF_ALLOW_UNLOCKED,
+ [SCX_OP_IDX(cgroup_set_idle)] = SCX_KF_ALLOW_UNLOCKED,
+#endif /* CONFIG_EXT_GROUP_SCHED */
+ [SCX_OP_IDX(sub_attach)] = SCX_KF_ALLOW_UNLOCKED,
+ [SCX_OP_IDX(sub_detach)] = SCX_KF_ALLOW_UNLOCKED,
+ [SCX_OP_IDX(cpu_online)] = SCX_KF_ALLOW_UNLOCKED,
+ [SCX_OP_IDX(cpu_offline)] = SCX_KF_ALLOW_UNLOCKED,
+ [SCX_OP_IDX(init)] = SCX_KF_ALLOW_UNLOCKED,
+ [SCX_OP_IDX(exit)] = SCX_KF_ALLOW_UNLOCKED,
+};
+
+/*
+ * Verifier-time filter for context-sensitive SCX kfuncs. Registered via the
+ * .filter field on each per-group btf_kfunc_id_set. The BPF core invokes this
+ * for every kfunc call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or
+ * BPF_PROG_TYPE_SYSCALL), regardless of which set originally introduced the
+ * kfunc - so the filter must short-circuit on kfuncs it doesn't govern (e.g.
+ * scx_kfunc_ids_any) by falling through to "allow" when none of the
+ * context-sensitive sets contain the kfunc.
+ */
+int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id)
+{
+ bool in_unlocked = btf_id_set8_contains(&scx_kfunc_ids_unlocked, kfunc_id);
+ bool in_select_cpu = btf_id_set8_contains(&scx_kfunc_ids_select_cpu, kfunc_id);
+ bool in_enqueue = btf_id_set8_contains(&scx_kfunc_ids_enqueue_dispatch, kfunc_id);
+ bool in_dispatch = btf_id_set8_contains(&scx_kfunc_ids_dispatch, kfunc_id);
+ bool in_cpu_release = btf_id_set8_contains(&scx_kfunc_ids_cpu_release, kfunc_id);
+ u32 moff, flags;
+
+ /* Not a context-sensitive kfunc (e.g. from scx_kfunc_ids_any) - allow. */
+ if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch || in_cpu_release))
+ return 0;
+
+ /* SYSCALL progs (e.g. BPF test_run()) may call unlocked and select_cpu kfuncs. */
+ if (prog->type == BPF_PROG_TYPE_SYSCALL)
+ return (in_unlocked || in_select_cpu) ? 0 : -EACCES;
+
+ if (prog->type != BPF_PROG_TYPE_STRUCT_OPS)
+ return -EACCES;
+
+ /*
+ * add_subprog_and_kfunc() collects all kfunc calls, including dead code
+ * guarded by bpf_ksym_exists(), before check_attach_btf_id() sets
+ * prog->aux->st_ops. Allow all kfuncs when st_ops is not yet set;
+ * do_check_main() re-runs the filter with st_ops set and enforces the
+ * actual restrictions.
+ */
+ if (!prog->aux->st_ops)
+ return 0;
+
+ /*
+ * Non-SCX struct_ops: only unlocked kfuncs are safe. The other
+ * context-sensitive kfuncs assume the rq lock is held by the SCX
+ * dispatch path, which doesn't apply to other struct_ops users.
+ */
+ if (prog->aux->st_ops != &bpf_sched_ext_ops)
+ return in_unlocked ? 0 : -EACCES;
+
+ /* SCX struct_ops: check the per-op allow list. */
+ moff = prog->aux->attach_st_ops_member_off;
+ flags = scx_kf_allow_flags[SCX_MOFF_IDX(moff)];
+
+ if ((flags & SCX_KF_ALLOW_UNLOCKED) && in_unlocked)
+ return 0;
+ if ((flags & SCX_KF_ALLOW_CPU_RELEASE) && in_cpu_release)
+ return 0;
+ if ((flags & SCX_KF_ALLOW_DISPATCH) && in_dispatch)
+ return 0;
+ if ((flags & SCX_KF_ALLOW_ENQUEUE) && in_enqueue)
+ return 0;
+ if ((flags & SCX_KF_ALLOW_SELECT_CPU) && in_select_cpu)
+ return 0;
+
+ return -EACCES;
+}
+
static int __init scx_init(void)
{
int ret;
@@ -7344,11 +9599,12 @@ static int __init scx_init(void)
* register_btf_kfunc_id_set() needs most of the system to be up.
*
* Some kfuncs are context-sensitive and can only be called from
- * specific SCX ops. They are grouped into BTF sets accordingly.
- * Unfortunately, BPF currently doesn't have a way of enforcing such
- * restrictions. Eventually, the verifier should be able to enforce
- * them. For now, register them the same and make each kfunc explicitly
- * check using scx_kf_allowed().
+ * specific SCX ops. They are grouped into per-context BTF sets, each
+ * registered with scx_kfunc_context_filter as its .filter callback. The
+ * BPF core dedups identical filter pointers per hook
+ * (btf_populate_kfunc_set()), so the filter is invoked exactly once per
+ * kfunc lookup; it consults scx_kf_allow_flags[] to enforce per-op
+ * restrictions at verify time.
*/
if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
&scx_kfunc_set_enqueue_dispatch)) ||
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 43429b33e52c..0b7fc46aee08 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -11,7 +11,7 @@
void scx_tick(struct rq *rq);
void init_scx_entity(struct sched_ext_entity *scx);
void scx_pre_fork(struct task_struct *p);
-int scx_fork(struct task_struct *p);
+int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs);
void scx_post_fork(struct task_struct *p);
void scx_cancel_fork(struct task_struct *p);
bool scx_can_stop_tick(struct rq *rq);
@@ -44,7 +44,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
static inline void scx_tick(struct rq *rq) {}
static inline void scx_pre_fork(struct task_struct *p) {}
-static inline int scx_fork(struct task_struct *p) { return 0; }
+static inline int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) { return 0; }
static inline void scx_post_fork(struct task_struct *p) {}
static inline void scx_cancel_fork(struct task_struct *p) {}
static inline u32 scx_cpuperf_target(s32 cpu) { return 0; }
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index c5a3b0bac7c3..443d12a3df67 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -368,7 +368,7 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
/*
* Enable NUMA optimization only when there are multiple NUMA domains
- * among the online CPUs and the NUMA domains don't perfectly overlaps
+ * among the online CPUs and the NUMA domains don't perfectly overlap
* with the LLC domains.
*
* If all CPUs belong to the same NUMA node and the same LLC domain,
@@ -424,18 +424,24 @@ static inline bool task_affinity_all(const struct task_struct *p)
* - prefer the last used CPU to take advantage of cached data (L1, L2) and
* branch prediction optimizations.
*
- * 3. Pick a CPU within the same LLC (Last-Level Cache):
+ * 3. Prefer @prev_cpu's SMT sibling:
+ * - if @prev_cpu is busy and no fully idle core is available, try to
+ * place the task on an idle SMT sibling of @prev_cpu; keeping the
+ * task on the same core makes migration cheaper, preserves L1 cache
+ * locality and reduces wakeup latency.
+ *
+ * 4. Pick a CPU within the same LLC (Last-Level Cache):
* - if the above conditions aren't met, pick a CPU that shares the same
* LLC, if the LLC domain is a subset of @cpus_allowed, to maintain
* cache locality.
*
- * 4. Pick a CPU within the same NUMA node, if enabled:
+ * 5. Pick a CPU within the same NUMA node, if enabled:
* - choose a CPU from the same NUMA node, if the node cpumask is a
* subset of @cpus_allowed, to reduce memory access latency.
*
- * 5. Pick any idle CPU within the @cpus_allowed domain.
+ * 6. Pick any idle CPU within the @cpus_allowed domain.
*
- * Step 3 and 4 are performed only if the system has, respectively,
+ * Step 4 and 5 are performed only if the system has, respectively,
* multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and
* scx_selcpu_topo_numa) and they don't contain the same subset of CPUs.
*
@@ -543,7 +549,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
* piled up on it even if there is an idle core elsewhere on
* the system.
*/
- waker_node = cpu_to_node(cpu);
+ waker_node = scx_cpu_node_if_enabled(cpu);
if (!(current->flags & PF_EXITING) &&
cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
(!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) &&
@@ -616,6 +622,20 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
goto out_unlock;
}
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * Use @prev_cpu's sibling if it's idle.
+ */
+ if (sched_smt_active()) {
+ for_each_cpu_and(cpu, cpu_smt_mask(prev_cpu), allowed) {
+ if (cpu == prev_cpu)
+ continue;
+ if (scx_idle_test_and_clear_cpu(cpu))
+ goto out_unlock;
+ }
+ }
+#endif
+
/*
* Search for any idle CPU in the same LLC domain.
*/
@@ -663,9 +683,8 @@ void scx_idle_init_masks(void)
BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL));
- /* Allocate per-node idle cpumasks */
- scx_idle_node_masks = kzalloc_objs(*scx_idle_node_masks,
- num_possible_nodes());
+ /* Allocate per-node idle cpumasks (use nr_node_ids for non-contiguous NUMA nodes) */
+ scx_idle_node_masks = kzalloc_objs(*scx_idle_node_masks, nr_node_ids);
BUG_ON(!scx_idle_node_masks);
for_each_node(i) {
@@ -768,8 +787,9 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
* either enqueue() sees the idle bit or update_idle() sees the task
* that enqueue() queued.
*/
- if (SCX_HAS_OP(sch, update_idle) && do_notify && !scx_rq_bypassing(rq))
- SCX_CALL_OP(sch, SCX_KF_REST, update_idle, rq, cpu_of(rq), idle);
+ if (SCX_HAS_OP(sch, update_idle) && do_notify &&
+ !scx_bypassing(sch, cpu_of(rq)))
+ SCX_CALL_OP(sch, update_idle, rq, cpu_of(rq), idle);
}
static void reset_idle_masks(struct sched_ext_ops *ops)
@@ -861,33 +881,40 @@ static bool check_builtin_idle_enabled(struct scx_sched *sch)
* code.
*
* We can't simply check whether @p->migration_disabled is set in a
- * sched_ext callback, because migration is always disabled for the current
- * task while running BPF code.
+ * sched_ext callback, because the BPF prolog (__bpf_prog_enter) may disable
+ * migration for the current task while running BPF code.
+ *
+ * Since the BPF prolog calls migrate_disable() only when CONFIG_PREEMPT_RCU
+ * is enabled (via rcu_read_lock_dont_migrate()), migration_disabled == 1 for
+ * the current task is ambiguous only in that case: it could be from the BPF
+ * prolog rather than a real migrate_disable() call.
*
- * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) respectively
- * disable and re-enable migration. For this reason, the current task
- * inside a sched_ext callback is always a migration-disabled task.
+ * Without CONFIG_PREEMPT_RCU, the BPF prolog never calls migrate_disable(),
+ * so migration_disabled == 1 always means the task is truly
+ * migration-disabled.
*
- * Therefore, when @p->migration_disabled == 1, check whether @p is the
- * current task or not: if it is, then migration was not disabled before
- * entering the callback, otherwise migration was disabled.
+ * Therefore, when migration_disabled == 1 and CONFIG_PREEMPT_RCU is enabled,
+ * check whether @p is the current task or not: if it is, then migration was
+ * not disabled before entering the callback, otherwise migration was disabled.
*
* Returns true if @p is migration-disabled, false otherwise.
*/
static bool is_bpf_migration_disabled(const struct task_struct *p)
{
- if (p->migration_disabled == 1)
- return p != current;
- else
- return p->migration_disabled;
+ if (p->migration_disabled == 1) {
+ if (IS_ENABLED(CONFIG_PREEMPT_RCU))
+ return p != current;
+ return true;
+ }
+ return p->migration_disabled;
}
static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
s32 prev_cpu, u64 wake_flags,
const struct cpumask *allowed, u64 flags)
{
- struct rq *rq;
- struct rq_flags rf;
+ unsigned long irq_flags;
+ bool we_locked = false;
s32 cpu;
if (!ops_cpu_valid(sch, prev_cpu, NULL))
@@ -897,27 +924,20 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
return -EBUSY;
/*
- * If called from an unlocked context, acquire the task's rq lock,
- * so that we can safely access p->cpus_ptr and p->nr_cpus_allowed.
+ * Accessing p->cpus_ptr / p->nr_cpus_allowed needs either @p's rq
+ * lock or @p's pi_lock. Three cases:
*
- * Otherwise, allow to use this kfunc only from ops.select_cpu()
- * and ops.select_enqueue().
- */
- if (scx_kf_allowed_if_unlocked()) {
- rq = task_rq_lock(p, &rf);
- } else {
- if (!scx_kf_allowed(sch, SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE))
- return -EPERM;
- rq = scx_locked_rq();
- }
-
- /*
- * Validate locking correctness to access p->cpus_ptr and
- * p->nr_cpus_allowed: if we're holding an rq lock, we're safe;
- * otherwise, assert that p->pi_lock is held.
+ * - inside ops.select_cpu(): try_to_wake_up() holds @p's pi_lock.
+ * - other rq-locked SCX op: scx_locked_rq() points at the held rq.
+ * - truly unlocked (UNLOCKED ops, SYSCALL, non-SCX struct_ops):
+ * nothing held, take pi_lock ourselves.
*/
- if (!rq)
+ if (this_rq()->scx.in_select_cpu) {
lockdep_assert_held(&p->pi_lock);
+ } else if (!scx_locked_rq()) {
+ raw_spin_lock_irqsave(&p->pi_lock, irq_flags);
+ we_locked = true;
+ }
/*
* This may also be called from ops.enqueue(), so we need to handle
@@ -936,8 +956,8 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
allowed ?: p->cpus_ptr, flags);
}
- if (scx_kf_allowed_if_unlocked())
- task_rq_unlock(rq, p, &rf);
+ if (we_locked)
+ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags);
return cpu;
}
@@ -946,14 +966,15 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
* scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or
* trigger an error if @cpu is invalid
* @cpu: target CPU
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*/
-__bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
+__bpf_kfunc s32 scx_bpf_cpu_node(s32 cpu, const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch) || !ops_cpu_valid(sch, cpu, NULL))
return NUMA_NO_NODE;
return cpu_to_node(cpu);
@@ -965,6 +986,7 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
* @prev_cpu: CPU @p was on previously
* @wake_flags: %SCX_WAKE_* flags
* @is_idle: out parameter indicating whether the returned CPU is idle
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked
* context such as a BPF test_run() call, as long as built-in CPU selection
@@ -975,14 +997,15 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
* currently idle and thus a good candidate for direct dispatching.
*/
__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
- u64 wake_flags, bool *is_idle)
+ u64 wake_flags, bool *is_idle,
+ const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
s32 cpu;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return -ENODEV;
@@ -1010,6 +1033,7 @@ struct scx_bpf_select_cpu_and_args {
* @args->prev_cpu: CPU @p was on previously
* @args->wake_flags: %SCX_WAKE_* flags
* @args->flags: %SCX_PICK_IDLE* flags
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument
* limit. BPF programs should use scx_bpf_select_cpu_and() which is provided
@@ -1028,13 +1052,14 @@ struct scx_bpf_select_cpu_and_args {
*/
__bpf_kfunc s32
__scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed,
- struct scx_bpf_select_cpu_and_args *args)
+ struct scx_bpf_select_cpu_and_args *args,
+ const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return -ENODEV;
@@ -1056,6 +1081,17 @@ __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64
if (unlikely(!sch))
return -ENODEV;
+#ifdef CONFIG_EXT_SUB_SCHED
+ /*
+ * Disallow if any sub-scheds are attached. There is no way to tell
+ * which scheduler called us, just error out @p's scheduler.
+ */
+ if (unlikely(!list_empty(&sch->children))) {
+ scx_error(scx_task_sched(p), "__scx_bpf_select_cpu_and() must be used");
+ return -EINVAL;
+ }
+#endif
+
return select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags,
cpus_allowed, flags);
}
@@ -1064,18 +1100,20 @@ __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64
* scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the
* idle-tracking per-CPU cpumask of a target NUMA node.
* @node: target NUMA node
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Returns an empty cpumask if idle tracking is not enabled, if @node is
* not valid, or running on a UP kernel. In this case the actual error will
* be reported to the BPF scheduler via scx_error().
*/
-__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
+__bpf_kfunc const struct cpumask *
+scx_bpf_get_idle_cpumask_node(s32 node, const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return cpu_none_mask;
@@ -1089,17 +1127,18 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
/**
* scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
* per-CPU cpumask.
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Returns an empty mask if idle tracking is not enabled, or running on a
* UP kernel.
*/
-__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return cpu_none_mask;
@@ -1119,18 +1158,20 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
* idle-tracking, per-physical-core cpumask of a target NUMA node. Can be
* used to determine if an entire physical core is free.
* @node: target NUMA node
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Returns an empty cpumask if idle tracking is not enabled, if @node is
* not valid, or running on a UP kernel. In this case the actual error will
* be reported to the BPF scheduler via scx_error().
*/
-__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
+__bpf_kfunc const struct cpumask *
+scx_bpf_get_idle_smtmask_node(s32 node, const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return cpu_none_mask;
@@ -1148,17 +1189,18 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
* scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
* per-physical-core cpumask. Can be used to determine if an entire physical
* core is free.
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Returns an empty mask if idle tracking is not enabled, or running on a
* UP kernel.
*/
-__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return cpu_none_mask;
@@ -1194,6 +1236,7 @@ __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
/**
* scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
* @cpu: cpu to test and clear idle for
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Returns %true if @cpu was idle and its idle state was successfully cleared.
* %false otherwise.
@@ -1201,13 +1244,13 @@ __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
* Unavailable if ops.update_idle() is implemented and
* %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
*/
-__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
+__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu, const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return false;
@@ -1225,6 +1268,7 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
* @cpus_allowed: Allowed cpumask
* @node: target NUMA node
* @flags: %SCX_PICK_IDLE_* flags
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node.
*
@@ -1240,13 +1284,14 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
* %SCX_OPS_BUILTIN_IDLE_PER_NODE is not set.
*/
__bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed,
- int node, u64 flags)
+ s32 node, u64 flags,
+ const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return -ENODEV;
@@ -1261,6 +1306,7 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed,
* scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
* @cpus_allowed: Allowed cpumask
* @flags: %SCX_PICK_IDLE_CPU_* flags
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
* number on success. -%EBUSY if no matching cpu was found.
@@ -1280,13 +1326,13 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed,
* scx_bpf_pick_idle_cpu_node() instead.
*/
__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
- u64 flags)
+ u64 flags, const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return -ENODEV;
@@ -1307,6 +1353,7 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
* @cpus_allowed: Allowed cpumask
* @node: target NUMA node
* @flags: %SCX_PICK_IDLE_CPU_* flags
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
* CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
@@ -1323,14 +1370,15 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
* CPU.
*/
__bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed,
- int node, u64 flags)
+ s32 node, u64 flags,
+ const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
s32 cpu;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return -ENODEV;
@@ -1356,6 +1404,7 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed,
* scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
* @cpus_allowed: Allowed cpumask
* @flags: %SCX_PICK_IDLE_CPU_* flags
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
*
* Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
* CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
@@ -1370,14 +1419,14 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed,
* scx_bpf_pick_any_cpu_node() instead.
*/
__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
- u64 flags)
+ u64 flags, const struct bpf_prog_aux *aux)
{
struct scx_sched *sch;
s32 cpu;
guard(rcu)();
- sch = rcu_dereference(scx_root);
+ sch = scx_prog_sched(aux);
if (unlikely(!sch))
return -ENODEV;
@@ -1402,20 +1451,17 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_idle)
-BTF_ID_FLAGS(func, scx_bpf_cpu_node)
-BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_ACQUIRE)
-BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE)
-BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_ACQUIRE)
-BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_cpu_node, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_IMPLICIT_ARGS | KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_IMPLICIT_ARGS | KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
-BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
-BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
-BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_IMPLICIT_ARGS | KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_idle)
static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
@@ -1423,13 +1469,38 @@ static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
.set = &scx_kfunc_ids_idle,
};
+/*
+ * The select_cpu kfuncs internally call task_rq_lock() when invoked from an
+ * rq-unlocked context, and thus cannot be safely called from arbitrary tracing
+ * contexts where @p's pi_lock state is unknown. Keep them out of
+ * BPF_PROG_TYPE_TRACING by registering them in their own set which is exposed
+ * only to STRUCT_OPS and SYSCALL programs.
+ *
+ * These kfuncs are also members of scx_kfunc_ids_unlocked (see ext.c) because
+ * they're callable from unlocked contexts in addition to ops.select_cpu() and
+ * ops.enqueue().
+ */
+BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
+BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_select_cpu,
+ .filter = scx_kfunc_context_filter,
+};
+
int scx_idle_init(void)
{
int ret;
ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ||
register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) ||
- register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle);
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle) ||
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) ||
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_select_cpu);
return ret;
}
diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h
index fa583f141f35..dc35f850481e 100644
--- a/kernel/sched/ext_idle.h
+++ b/kernel/sched/ext_idle.h
@@ -12,6 +12,8 @@
struct sched_ext_ops;
+extern struct btf_id_set8 scx_kfunc_ids_select_cpu;
+
void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops);
void scx_idle_init_masks(void);
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 386c677e4c9a..62ce4eaf6a3f 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -6,6 +6,7 @@
* Copyright (c) 2025 Tejun Heo <tj@kernel.org>
*/
#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
+#define SCX_MOFF_IDX(moff) ((moff) / sizeof(void (*)(void)))
enum scx_consts {
SCX_DSP_DFL_MAX_BATCH = 32,
@@ -24,10 +25,16 @@ enum scx_consts {
*/
SCX_TASK_ITER_BATCH = 32,
+ SCX_BYPASS_HOST_NTH = 2,
+
SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC,
SCX_BYPASS_LB_DONOR_PCT = 125,
SCX_BYPASS_LB_MIN_DELTA_DIV = 4,
SCX_BYPASS_LB_BATCH = 256,
+
+ SCX_REENQ_LOCAL_MAX_REPEAT = 256,
+
+ SCX_SUB_MAX_DEPTH = 4,
};
enum scx_exit_kind {
@@ -38,6 +45,7 @@ enum scx_exit_kind {
SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */
SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */
SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */
+ SCX_EXIT_PARENT, /* parent exiting */
SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */
SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */
@@ -62,6 +70,7 @@ enum scx_exit_kind {
enum scx_exit_code {
/* Reasons */
SCX_ECODE_RSN_HOTPLUG = 1LLU << 32,
+ SCX_ECODE_RSN_CGROUP_OFFLINE = 2LLU << 32,
/* Actions */
SCX_ECODE_ACT_RESTART = 1LLU << 48,
@@ -74,7 +83,7 @@ enum scx_exit_flags {
* info communication. The following flag indicates whether ops.init()
* finished successfully.
*/
- SCX_EFLAG_INITIALIZED,
+ SCX_EFLAG_INITIALIZED = 1LLU << 0,
};
/*
@@ -175,9 +184,10 @@ enum scx_ops_flags {
SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6,
/*
- * CPU cgroup support flags
+ * If set, %SCX_ENQ_IMMED is assumed to be set on all local DSQ
+ * enqueues.
*/
- SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */
+ SCX_OPS_ALWAYS_ENQ_IMMED = 1LLU << 7,
SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
SCX_OPS_ENQ_LAST |
@@ -186,7 +196,7 @@ enum scx_ops_flags {
SCX_OPS_ALLOW_QUEUED_WAKEUP |
SCX_OPS_SWITCH_PARTIAL |
SCX_OPS_BUILTIN_IDLE_PER_NODE |
- SCX_OPS_HAS_CGROUP_WEIGHT,
+ SCX_OPS_ALWAYS_ENQ_IMMED,
/* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
__SCX_OPS_INTERNAL_MASK = 0xffLLU << 56,
@@ -213,7 +223,7 @@ struct scx_exit_task_args {
bool cancelled;
};
-/* argument container for ops->cgroup_init() */
+/* argument container for ops.cgroup_init() */
struct scx_cgroup_init_args {
/* the weight of the cgroup [1..10000] */
u32 weight;
@@ -236,12 +246,12 @@ enum scx_cpu_preempt_reason {
};
/*
- * Argument container for ops->cpu_acquire(). Currently empty, but may be
+ * Argument container for ops.cpu_acquire(). Currently empty, but may be
* expanded in the future.
*/
struct scx_cpu_acquire_args {};
-/* argument container for ops->cpu_release() */
+/* argument container for ops.cpu_release() */
struct scx_cpu_release_args {
/* the reason the CPU was preempted */
enum scx_cpu_preempt_reason reason;
@@ -250,9 +260,7 @@ struct scx_cpu_release_args {
struct task_struct *task;
};
-/*
- * Informational context provided to dump operations.
- */
+/* informational context provided to dump operations */
struct scx_dump_ctx {
enum scx_exit_kind kind;
s64 exit_code;
@@ -261,6 +269,18 @@ struct scx_dump_ctx {
u64 at_jiffies;
};
+/* argument container for ops.sub_attach() */
+struct scx_sub_attach_args {
+ struct sched_ext_ops *ops;
+ char *cgroup_path;
+};
+
+/* argument container for ops.sub_detach() */
+struct scx_sub_detach_args {
+ struct sched_ext_ops *ops;
+ char *cgroup_path;
+};
+
/**
* struct sched_ext_ops - Operation table for BPF scheduler implementation
*
@@ -721,6 +741,20 @@ struct sched_ext_ops {
#endif /* CONFIG_EXT_GROUP_SCHED */
+ /**
+ * @sub_attach: Attach a sub-scheduler
+ * @args: argument container, see the struct definition
+ *
+ * Return 0 to accept the sub-scheduler. -errno to reject.
+ */
+ s32 (*sub_attach)(struct scx_sub_attach_args *args);
+
+ /**
+ * @sub_detach: Detach a sub-scheduler
+ * @args: argument container, see the struct definition
+ */
+ void (*sub_detach)(struct scx_sub_detach_args *args);
+
/*
* All online ops must come before ops.cpu_online().
*/
@@ -762,6 +796,10 @@ struct sched_ext_ops {
*/
void (*exit)(struct scx_exit_info *info);
+ /*
+	 * Data fields must come after all ops fields.
+ */
+
/**
* @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch
*/
@@ -797,6 +835,12 @@ struct sched_ext_ops {
u64 hotplug_seq;
/**
+	 * @sub_cgroup_id: When >1, attach the scheduler as a sub-scheduler on the
+ * specified cgroup.
+ */
+ u64 sub_cgroup_id;
+
+ /**
* @name: BPF scheduler's name
*
* Must be a non-zero valid BPF object name including only isalnum(),
@@ -806,7 +850,7 @@ struct sched_ext_ops {
char name[SCX_OPS_NAME_LEN];
/* internal use only, must be NULL */
- void *priv;
+ void __rcu *priv;
};
enum scx_opi {
@@ -854,6 +898,24 @@ struct scx_event_stats {
s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
/*
+ * The number of times a task, enqueued on a local DSQ with
+ * SCX_ENQ_IMMED, was re-enqueued because the CPU was not available for
+ * immediate execution.
+ */
+ s64 SCX_EV_REENQ_IMMED;
+
+ /*
+ * The number of times a reenq of local DSQ caused another reenq of
+ * local DSQ. This can happen when %SCX_ENQ_IMMED races against a higher
+ * priority class task even if the BPF scheduler always satisfies the
+ * prerequisites for %SCX_ENQ_IMMED at the time of enqueue. However,
+ * that scenario is very unlikely and this count going up regularly
+ * indicates that the BPF scheduler is handling %SCX_ENQ_REENQ
+ * incorrectly causing recursive reenqueues.
+ */
+ s64 SCX_EV_REENQ_LOCAL_REPEAT;
+
+ /*
* Total number of times a task's time slice was refilled with the
* default value (SCX_SLICE_DFL).
*/
@@ -873,15 +935,77 @@ struct scx_event_stats {
* The number of times the bypassing mode has been activated.
*/
s64 SCX_EV_BYPASS_ACTIVATE;
+
+ /*
+ * The number of times the scheduler attempted to insert a task that it
+ * doesn't own into a DSQ. Such attempts are ignored.
+ *
+ * As BPF schedulers are allowed to ignore dequeues, it's difficult to
+ * tell whether such an attempt is from a scheduler malfunction or an
+ * ignored dequeue around sub-sched enabling. If this count keeps going
+ * up regardless of sub-sched enabling, it likely indicates a bug in the
+ * scheduler.
+ */
+ s64 SCX_EV_INSERT_NOT_OWNED;
+
+ /*
+ * The number of times tasks from bypassing descendants are scheduled
+ * from sub_bypass_dsq's.
+ */
+ s64 SCX_EV_SUB_BYPASS_DISPATCH;
+};
+
+struct scx_sched;
+
+enum scx_sched_pcpu_flags {
+ SCX_SCHED_PCPU_BYPASSING = 1LLU << 0,
+};
+
+/* dispatch buf */
+struct scx_dsp_buf_ent {
+ struct task_struct *task;
+ unsigned long qseq;
+ u64 dsq_id;
+ u64 enq_flags;
+};
+
+struct scx_dsp_ctx {
+ struct rq *rq;
+ u32 cursor;
+ u32 nr_tasks;
+ struct scx_dsp_buf_ent buf[];
+};
+
+struct scx_deferred_reenq_local {
+ struct list_head node;
+ u64 flags;
+ u64 seq;
+ u32 cnt;
};
struct scx_sched_pcpu {
+ struct scx_sched *sch;
+ u64 flags; /* protected by rq lock */
+
/*
* The event counters are in a per-CPU variable to minimize the
* accounting overhead. A system-wide view on the event counter is
* constructed when requested by scx_bpf_events().
*/
struct scx_event_stats event_stats;
+
+ struct scx_deferred_reenq_local deferred_reenq_local;
+ struct scx_dispatch_q bypass_dsq;
+#ifdef CONFIG_EXT_SUB_SCHED
+ u32 bypass_host_seq;
+#endif
+
+ /* must be the last entry - contains flex array */
+ struct scx_dsp_ctx dsp_ctx;
+};
+
+struct scx_sched_pnode {
+ struct scx_dispatch_q global_dsq;
};
struct scx_sched {
@@ -897,15 +1021,50 @@ struct scx_sched {
* per-node split isn't sufficient, it can be further split.
*/
struct rhashtable dsq_hash;
- struct scx_dispatch_q **global_dsqs;
+ struct scx_sched_pnode **pnode;
struct scx_sched_pcpu __percpu *pcpu;
+ u64 slice_dfl;
+ u64 bypass_timestamp;
+ s32 bypass_depth;
+
+ /* bypass dispatch path enable state, see bypass_dsp_enabled() */
+ unsigned long bypass_dsp_claim;
+ atomic_t bypass_dsp_enable_depth;
+
+ bool aborting;
+ bool dump_disabled; /* protected by scx_dump_lock */
+ u32 dsp_max_batch;
+ s32 level;
+
/*
* Updates to the following warned bitfields can race causing RMW issues
* but it doesn't really matter.
*/
bool warned_zero_slice:1;
bool warned_deprecated_rq:1;
+ bool warned_unassoc_progs:1;
+
+ struct list_head all;
+
+#ifdef CONFIG_EXT_SUB_SCHED
+ struct rhash_head hash_node;
+
+ struct list_head children;
+ struct list_head sibling;
+ struct cgroup *cgrp;
+ char *cgrp_path;
+ struct kset *sub_kset;
+
+ bool sub_attached;
+#endif /* CONFIG_EXT_SUB_SCHED */
+
+ /*
+ * The maximum amount of time in jiffies that a task may be runnable
+ * without being scheduled on a CPU. If this timeout is exceeded, it
+ * will trigger scx_error().
+ */
+ unsigned long watchdog_timeout;
atomic_t exit_kind;
struct scx_exit_info *exit_info;
@@ -913,9 +1072,13 @@ struct scx_sched {
struct kobject kobj;
struct kthread_worker *helper;
- struct irq_work error_irq_work;
+ struct irq_work disable_irq_work;
struct kthread_work disable_work;
+ struct timer_list bypass_lb_timer;
struct rcu_work rcu_work;
+
+ /* all ancestors including self */
+ struct scx_sched *ancestors[];
};
enum scx_wake_flags {
@@ -942,13 +1105,27 @@ enum scx_enq_flags {
SCX_ENQ_PREEMPT = 1LLU << 32,
/*
- * The task being enqueued was previously enqueued on the current CPU's
- * %SCX_DSQ_LOCAL, but was removed from it in a call to the
- * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
- * invoked in a ->cpu_release() callback, and the task is again
- * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the
- * task will not be scheduled on the CPU until at least the next invocation
- * of the ->cpu_acquire() callback.
+ * Only allowed on local DSQs. Guarantees that the task either gets
+ * on the CPU immediately and stays on it, or gets reenqueued back
+ * to the BPF scheduler. It will never linger on a local DSQ or be
+ * silently put back after preemption.
+ *
+ * The protection persists until the next fresh enqueue - it
+ * survives SAVE/RESTORE cycles, slice extensions and preemption.
+ * If the task can't stay on the CPU for any reason, it gets
+ * reenqueued back to the BPF scheduler.
+ *
+ * Exiting and migration-disabled tasks bypass ops.enqueue() and
+ * are placed directly on a local DSQ without IMMED protection
+ * unless %SCX_OPS_ENQ_EXITING and %SCX_OPS_ENQ_MIGRATION_DISABLED
+ * are set respectively.
+ */
+ SCX_ENQ_IMMED = 1LLU << 33,
+
+ /*
+ * The task being enqueued was previously enqueued on a DSQ, but was
+ * removed and is being re-enqueued. See SCX_TASK_REENQ_* flags to find
+ * out why a given task is being reenqueued.
*/
SCX_ENQ_REENQ = 1LLU << 40,
@@ -969,6 +1146,7 @@ enum scx_enq_flags {
SCX_ENQ_CLEAR_OPSS = 1LLU << 56,
SCX_ENQ_DSQ_PRIQ = 1LLU << 57,
SCX_ENQ_NESTED = 1LLU << 58,
+ SCX_ENQ_GDSQ_FALLBACK = 1LLU << 59, /* fell back to global DSQ */
};
enum scx_deq_flags {
@@ -982,6 +1160,28 @@ enum scx_deq_flags {
* it hasn't been dispatched yet. Dequeue from the BPF side.
*/
SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32,
+
+ /*
+ * The task is being dequeued due to a property change (e.g.,
+ * sched_setaffinity(), sched_setscheduler(), set_user_nice(),
+ * etc.).
+ */
+ SCX_DEQ_SCHED_CHANGE = 1LLU << 33,
+};
+
+enum scx_reenq_flags {
+ /* low 16bits determine which tasks should be reenqueued */
+ SCX_REENQ_ANY = 1LLU << 0, /* all tasks */
+
+ __SCX_REENQ_FILTER_MASK = 0xffffLLU,
+
+ __SCX_REENQ_USER_MASK = SCX_REENQ_ANY,
+
+ /* bits 32-35 used by task_should_reenq() */
+ SCX_REENQ_TSR_RQ_OPEN = 1LLU << 32,
+ SCX_REENQ_TSR_NOT_FIRST = 1LLU << 33,
+
+ __SCX_REENQ_TSR_MASK = 0xfLLU << 32,
};
enum scx_pick_idle_cpu_flags {
@@ -1035,26 +1235,108 @@ static const char *scx_enable_state_str[] = {
};
/*
- * sched_ext_entity->ops_state
+ * Task Ownership State Machine (sched_ext_entity->ops_state)
*
- * Used to track the task ownership between the SCX core and the BPF scheduler.
- * State transitions look as follows:
+ * The sched_ext core uses this state machine to track task ownership
+ * between the SCX core and the BPF scheduler. This allows the BPF
+ * scheduler to dispatch tasks without strict ordering requirements, while
+ * the SCX core safely rejects invalid dispatches.
*
- * NONE -> QUEUEING -> QUEUED -> DISPATCHING
- * ^ | |
- * | v v
- * \-------------------------------/
+ * State Transitions
*
- * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
- * sites for explanations on the conditions being waited upon and why they are
- * safe. Transitions out of them into NONE or QUEUED must store_release and the
- * waiters should load_acquire.
+ * .------------> NONE (owned by SCX core)
+ * | | ^
+ * | enqueue | | direct dispatch
+ * | v |
+ * | QUEUEING -------'
+ * | |
+ * | enqueue |
+ * | completes |
+ * | v
+ * | QUEUED (owned by BPF scheduler)
+ * | |
+ * | dispatch |
+ * | |
+ * | v
+ * | DISPATCHING
+ * | |
+ * | dispatch |
+ * | completes |
+ * `---------------'
*
- * Tracking scx_ops_state enables sched_ext core to reliably determine whether
- * any given task can be dispatched by the BPF scheduler at all times and thus
- * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
- * to try to dispatch any task anytime regardless of its state as the SCX core
- * can safely reject invalid dispatches.
+ * State Descriptions
+ *
+ * - %SCX_OPSS_NONE:
+ * Task is owned by the SCX core. It's either on a run queue, running,
+ * or being manipulated by the core scheduler. The BPF scheduler has no
+ * claim on this task.
+ *
+ * - %SCX_OPSS_QUEUEING:
+ * Transitional state while transferring a task from the SCX core to
+ * the BPF scheduler. The task's rq lock is held during this state.
+ * Since QUEUEING is both entered and exited under the rq lock, dequeue
+ * can never observe this state (it would be a BUG). When finishing a
+ * dispatch, if the task is still in %SCX_OPSS_QUEUEING the completion
+ * path busy-waits for it to leave this state (via wait_ops_state())
+ * before retrying.
+ *
+ * - %SCX_OPSS_QUEUED:
+ * Task is owned by the BPF scheduler. It's on a DSQ (dispatch queue)
+ * and the BPF scheduler is responsible for dispatching it. A QSEQ
+ * (queue sequence number) is embedded in this state to detect
+ * dispatch/dequeue races: if a task is dequeued and re-enqueued, the
+ * QSEQ changes and any in-flight dispatch operations targeting the old
+ * QSEQ are safely ignored.
+ *
+ * - %SCX_OPSS_DISPATCHING:
+ * Transitional state while transferring a task from the BPF scheduler
+ * back to the SCX core. This state indicates the BPF scheduler has
+ * selected the task for execution. When dequeue needs to take the task
+ * off a DSQ and it is still in %SCX_OPSS_DISPATCHING, the dequeue path
+ * busy-waits for it to leave this state (via wait_ops_state()) before
+ * proceeding. Exits to %SCX_OPSS_NONE when dispatch completes.
+ *
+ * Memory Ordering
+ *
+ * Transitions out of %SCX_OPSS_QUEUEING and %SCX_OPSS_DISPATCHING into
+ * %SCX_OPSS_NONE or %SCX_OPSS_QUEUED must use atomic_long_set_release()
+ * and waiters must use atomic_long_read_acquire(). This ensures proper
+ * synchronization between concurrent operations.
+ *
+ * Cross-CPU Task Migration
+ *
+ * When moving a task in the %SCX_OPSS_DISPATCHING state, we can't simply
+ * grab the target CPU's rq lock because a concurrent dequeue might be
+ * waiting on %SCX_OPSS_DISPATCHING while holding the source rq lock
+ * (deadlock).
+ *
+ * The sched_ext core uses a "lock dancing" protocol coordinated by
+ * p->scx.holding_cpu. When moving a task to a different rq:
+ *
+ * 1. Verify task can be moved (CPU affinity, migration_disabled, etc.)
+ * 2. Set p->scx.holding_cpu to the current CPU
+ * 3. Set task state to %SCX_OPSS_NONE; dequeue waits while DISPATCHING
+ * is set, so clearing DISPATCHING first prevents the circular wait
+ * (safe to lock the rq we need)
+ * 4. Unlock the current CPU's rq
+ * 5. Lock src_rq (where the task currently lives)
+ * 6. Verify p->scx.holding_cpu == current CPU, if not, dequeue won the
+ * race (dequeue clears holding_cpu to -1 when it takes the task), in
+ * this case migration is aborted
+ * 7. If src_rq == dst_rq: clear holding_cpu and enqueue directly
+ * into dst_rq's local DSQ (no lock swap needed)
+ * 8. Otherwise: call move_remote_task_to_local_dsq(), which releases
+ * src_rq, locks dst_rq, and performs the deactivate/activate
+ * migration cycle (dst_rq is held on return)
+ * 9. Unlock dst_rq and re-lock the current CPU's rq to restore
+ * the lock state expected by the caller
+ *
+ * If any verification fails, abort the migration.
+ *
+ * This state tracking allows the BPF scheduler to try to dispatch any task
+ * at any time regardless of its state. The SCX core can safely
+ * reject/ignore invalid dispatches, simplifying the BPF scheduler
+ * implementation.
*/
enum scx_ops_state {
SCX_OPSS_NONE, /* owned by the SCX core */
@@ -1079,8 +1361,11 @@ enum scx_ops_state {
#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
+extern struct scx_sched __rcu *scx_root;
DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
+int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id);
+
/*
* Return the rq currently locked from an scx callback, or NULL if no rq is
* locked.
@@ -1090,12 +1375,107 @@ static inline struct rq *scx_locked_rq(void)
return __this_cpu_read(scx_locked_rq_state);
}
-static inline bool scx_kf_allowed_if_unlocked(void)
+static inline bool scx_bypassing(struct scx_sched *sch, s32 cpu)
+{
+ return unlikely(per_cpu_ptr(sch->pcpu, cpu)->flags &
+ SCX_SCHED_PCPU_BYPASSING);
+}
+
+#ifdef CONFIG_EXT_SUB_SCHED
+/**
+ * scx_task_sched - Find scx_sched scheduling a task
+ * @p: task of interest
+ *
+ * Return @p's scheduler instance. Must be called with @p's pi_lock or rq lock
+ * held.
+ */
+static inline struct scx_sched *scx_task_sched(const struct task_struct *p)
+{
+ return rcu_dereference_protected(p->scx.sched,
+ lockdep_is_held(&p->pi_lock) ||
+ lockdep_is_held(__rq_lockp(task_rq(p))));
+}
+
+/**
+ * scx_task_sched_rcu - Find scx_sched scheduling a task
+ * @p: task of interest
+ *
+ * Return @p's scheduler instance. The returned scx_sched is RCU protected.
+ */
+static inline struct scx_sched *scx_task_sched_rcu(const struct task_struct *p)
+{
+ return rcu_dereference_all(p->scx.sched);
+}
+
+/**
+ * scx_task_on_sched - Is a task on the specified sched?
+ * @sch: sched to test against
+ * @p: task of interest
+ *
+ * Returns %true if @p is on @sch, %false otherwise.
+ */
+static inline bool scx_task_on_sched(struct scx_sched *sch,
+ const struct task_struct *p)
+{
+ return rcu_access_pointer(p->scx.sched) == sch;
+}
+
+/**
+ * scx_prog_sched - Find scx_sched associated with a BPF prog
+ * @aux: aux passed in from BPF to a kfunc
+ *
+ * To be called from kfuncs. Return the scheduler instance associated with the
+ * BPF program given the implicit kfunc argument aux. The returned scx_sched is
+ * RCU protected.
+ */
+static inline struct scx_sched *scx_prog_sched(const struct bpf_prog_aux *aux)
+{
+ struct sched_ext_ops *ops;
+ struct scx_sched *root;
+
+ ops = bpf_prog_get_assoc_struct_ops(aux);
+ if (likely(ops))
+ return rcu_dereference_all(ops->priv);
+
+ root = rcu_dereference_all(scx_root);
+ if (root) {
+ /*
+ * COMPAT-v6.19: Schedulers built before sub-sched support was
+ * introduced may have unassociated non-struct_ops programs.
+ */
+ if (!root->ops.sub_attach)
+ return root;
+
+ if (!root->warned_unassoc_progs) {
+ printk_deferred(KERN_WARNING "sched_ext: Unassociated program %s (id %d)\n",
+ aux->name, aux->id);
+ root->warned_unassoc_progs = true;
+ }
+ }
+
+ return NULL;
+}
+#else /* CONFIG_EXT_SUB_SCHED */
+static inline struct scx_sched *scx_task_sched(const struct task_struct *p)
+{
+ return rcu_dereference_protected(scx_root,
+ lockdep_is_held(&p->pi_lock) ||
+ lockdep_is_held(__rq_lockp(task_rq(p))));
+}
+
+static inline struct scx_sched *scx_task_sched_rcu(const struct task_struct *p)
+{
+ return rcu_dereference_all(scx_root);
+}
+
+static inline bool scx_task_on_sched(struct scx_sched *sch,
+ const struct task_struct *p)
{
- return !current->scx.kf_mask;
+ return true;
}
-static inline bool scx_rq_bypassing(struct rq *rq)
+static struct scx_sched *scx_prog_sched(const struct bpf_prog_aux *aux)
{
- return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
+ return rcu_dereference_all(scx_root);
}
+#endif /* CONFIG_EXT_SUB_SCHED */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index eea99ec01a3f..69361c63353a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -225,6 +225,7 @@ void __init sched_init_granularity(void)
update_sysctl();
}
+#ifndef CONFIG_64BIT
#define WMULT_CONST (~0U)
#define WMULT_SHIFT 32
@@ -283,6 +284,12 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
return mul_u64_u32_shr(delta_exec, fact, shift);
}
+#else
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
+{
+ return (delta_exec * weight) / lw->weight;
+}
+#endif
/*
* delta /= w
@@ -589,6 +596,21 @@ static inline bool entity_before(const struct sched_entity *a,
return vruntime_cmp(a->deadline, "<", b->deadline);
}
+/*
+ * Per avg_vruntime() below, cfs_rq::zero_vruntime is only slightly stale
+ * and this value should be no more than two lag bounds. Which puts it in the
+ * general order of:
+ *
+ * (slice + TICK_NSEC) << NICE_0_LOAD_SHIFT
+ *
+ * which is around 44 bits in size (on 64bit); that is 20 for
+ * NICE_0_LOAD_SHIFT, another 20 for NSEC_PER_MSEC and then a handful for
+ * however many msec the actual slice+tick ends up being.
+ *
+ * (disregarding the actual divide-by-weight part makes for the worst case
+ * weight of 2, which nicely cancels vs the fuzz in zero_vruntime not actually
+ * being the zero-lag point).
+ */
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime);
@@ -650,25 +672,83 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Since zero_vruntime closely tracks the per-task service, these
* deltas: (v_i - v0), will be in the order of the maximal (virtual) lag
* induced in the system due to quantisation.
- *
- * Also, we use scale_load_down() to reduce the size.
- *
- * As measured, the max (key * weight) value was ~44 bits for a kernel build.
*/
+static inline unsigned long avg_vruntime_weight(struct cfs_rq *cfs_rq, unsigned long w)
+{
+#ifdef CONFIG_64BIT
+ if (cfs_rq->sum_shift)
+ w = max(2UL, w >> cfs_rq->sum_shift);
+#endif
+ return w;
+}
+
+static inline void
+__sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
+ s64 w_vruntime, key = entity_key(cfs_rq, se);
+
+ w_vruntime = key * weight;
+ WARN_ON_ONCE((w_vruntime >> 63) != (w_vruntime >> 62));
+
+ cfs_rq->sum_w_vruntime += w_vruntime;
+ cfs_rq->sum_weight += weight;
+}
+
static void
-sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+sum_w_vruntime_add_paranoid(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- unsigned long weight = scale_load_down(se->load.weight);
- s64 key = entity_key(cfs_rq, se);
+ unsigned long weight;
+ s64 key, tmp;
+
+again:
+ weight = avg_vruntime_weight(cfs_rq, se->load.weight);
+ key = entity_key(cfs_rq, se);
+
+ if (check_mul_overflow(key, weight, &key))
+ goto overflow;
- cfs_rq->sum_w_vruntime += key * weight;
+ if (check_add_overflow(cfs_rq->sum_w_vruntime, key, &tmp))
+ goto overflow;
+
+ cfs_rq->sum_w_vruntime = tmp;
cfs_rq->sum_weight += weight;
+ return;
+
+overflow:
+ /*
+ * There's gotta be a limit -- if we're still failing at this point
+ * there's really nothing much to be done about things.
+ */
+ BUG_ON(cfs_rq->sum_shift >= 10);
+ cfs_rq->sum_shift++;
+
+ /*
+ * Note: \Sum (k_i * (w_i >> 1)) != (\Sum (k_i * w_i)) >> 1
+ */
+ cfs_rq->sum_w_vruntime = 0;
+ cfs_rq->sum_weight = 0;
+
+ for (struct rb_node *node = cfs_rq->tasks_timeline.rb_leftmost;
+ node; node = rb_next(node))
+ __sum_w_vruntime_add(cfs_rq, __node_2_se(node));
+
+ goto again;
+}
+
+static void
+sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if (sched_feat(PARANOID_AVG))
+ return sum_w_vruntime_add_paranoid(cfs_rq, se);
+
+ __sum_w_vruntime_add(cfs_rq, se);
}
static void
sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- unsigned long weight = scale_load_down(se->load.weight);
+ unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
s64 key = entity_key(cfs_rq, se);
cfs_rq->sum_w_vruntime -= key * weight;
@@ -676,41 +756,65 @@ sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
static inline
-void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+void update_zero_vruntime(struct cfs_rq *cfs_rq, s64 delta)
{
/*
- * v' = v + d ==> sum_w_vruntime' = sum_runtime - d*sum_weight
+ * v' = v + d ==> sum_w_vruntime' = sum_w_vruntime - d*sum_weight
*/
cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta;
+ cfs_rq->zero_vruntime += delta;
}
/*
- * Specifically: avg_runtime() + 0 must result in entity_eligible() := true
+ * Specifically: avg_vruntime() + 0 must result in entity_eligible() := true
* For this to be so, the result of this function must have a left bias.
+ *
+ * Called in:
+ * - place_entity() -- before enqueue
+ * - update_entity_lag() -- before dequeue
+ * - update_deadline() -- slice expiration
+ *
+ * This means it is one entry 'behind' but that puts it close enough to where
+ * the bound on entity_key() is at most two lag bounds.
*/
u64 avg_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
- s64 avg = cfs_rq->sum_w_vruntime;
- long load = cfs_rq->sum_weight;
+ long weight = cfs_rq->sum_weight;
+ s64 delta = 0;
- if (curr && curr->on_rq) {
- unsigned long weight = scale_load_down(curr->load.weight);
+ if (curr && !curr->on_rq)
+ curr = NULL;
- avg += entity_key(cfs_rq, curr) * weight;
- load += weight;
- }
+ if (weight) {
+ s64 runtime = cfs_rq->sum_w_vruntime;
+
+ if (curr) {
+ unsigned long w = avg_vruntime_weight(cfs_rq, curr->load.weight);
+
+ runtime += entity_key(cfs_rq, curr) * w;
+ weight += w;
+ }
- if (load) {
/* sign flips effective floor / ceiling */
- if (avg < 0)
- avg -= (load - 1);
- avg = div_s64(avg, load);
+ if (runtime < 0)
+ runtime -= (weight - 1);
+
+ delta = div64_long(runtime, weight);
+ } else if (curr) {
+ /*
+ * When there is but one element, it is the average.
+ */
+ delta = curr->vruntime - cfs_rq->zero_vruntime;
}
- return cfs_rq->zero_vruntime + avg;
+ update_zero_vruntime(cfs_rq, delta);
+
+ return cfs_rq->zero_vruntime;
}
+static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq);
+
/*
* lag_i = S - s_i = w_i * (V - v_i)
*
@@ -724,19 +828,45 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
* EEVDF gives the following limit for a steady state system:
*
* -r_max < lag < max(r_max, q)
- *
- * XXX could add max_slice to the augmented data to track this.
*/
-static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static s64 entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 avruntime)
{
+ u64 max_slice = cfs_rq_max_slice(cfs_rq) + TICK_NSEC;
s64 vlag, limit;
+ vlag = avruntime - se->vruntime;
+ limit = calc_delta_fair(max_slice, se);
+
+ return clamp(vlag, -limit, limit);
+}
+
+/*
+ * Delayed dequeue aims to reduce the negative lag of a dequeued task. While
+ * updating the lag of an entity, check that negative lag didn't increase
+ * during the delayed dequeue period which would be unfair.
+ * Similarly, check that the entity didn't gain positive lag when DELAY_ZERO
+ * is set.
+ *
+ * Return true if the lag has been adjusted.
+ */
+static __always_inline
+bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ s64 vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq));
+ bool ret;
+
WARN_ON_ONCE(!se->on_rq);
- vlag = avg_vruntime(cfs_rq) - se->vruntime;
- limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+ if (se->sched_delayed) {
+ /* previous vlag < 0 otherwise se would not be delayed */
+ vlag = max(vlag, se->vlag);
+ if (sched_feat(DELAY_ZERO))
+ vlag = min(vlag, 0);
+ }
+ ret = (vlag == se->vlag);
+ se->vlag = vlag;
- se->vlag = clamp(vlag, -limit, limit);
+ return ret;
}
/*
@@ -763,7 +893,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
long load = cfs_rq->sum_weight;
if (curr && curr->on_rq) {
- unsigned long weight = scale_load_down(curr->load.weight);
+ unsigned long weight = avg_vruntime_weight(cfs_rq, curr->load.weight);
avg += entity_key(cfs_rq, curr) * weight;
load += weight;
@@ -777,16 +907,6 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
return vruntime_eligible(cfs_rq, se->vruntime);
}
-static void update_zero_vruntime(struct cfs_rq *cfs_rq)
-{
- u64 vruntime = avg_vruntime(cfs_rq);
- s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime);
-
- sum_w_vruntime_update(cfs_rq, delta);
-
- cfs_rq->zero_vruntime = vruntime;
-}
-
static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
{
struct sched_entity *root = __pick_root_entity(cfs_rq);
@@ -802,6 +922,21 @@ static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
return min_slice;
}
+static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *root = __pick_root_entity(cfs_rq);
+ struct sched_entity *curr = cfs_rq->curr;
+ u64 max_slice = 0ULL;
+
+ if (curr && curr->on_rq)
+ max_slice = curr->slice;
+
+ if (root)
+ max_slice = max(max_slice, root->max_slice);
+
+ return max_slice;
+}
+
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
{
return entity_before(__node_2_se(a), __node_2_se(b));
@@ -826,6 +961,15 @@ static inline void __min_slice_update(struct sched_entity *se, struct rb_node *n
}
}
+static inline void __max_slice_update(struct sched_entity *se, struct rb_node *node)
+{
+ if (node) {
+ struct sched_entity *rse = __node_2_se(node);
+ if (rse->max_slice > se->max_slice)
+ se->max_slice = rse->max_slice;
+ }
+}
+
/*
* se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
*/
@@ -833,6 +977,7 @@ static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
{
u64 old_min_vruntime = se->min_vruntime;
u64 old_min_slice = se->min_slice;
+ u64 old_max_slice = se->max_slice;
struct rb_node *node = &se->run_node;
se->min_vruntime = se->vruntime;
@@ -843,8 +988,13 @@ static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
__min_slice_update(se, node->rb_right);
__min_slice_update(se, node->rb_left);
+ se->max_slice = se->slice;
+ __max_slice_update(se, node->rb_right);
+ __max_slice_update(se, node->rb_left);
+
return se->min_vruntime == old_min_vruntime &&
- se->min_slice == old_min_slice;
+ se->min_slice == old_min_slice &&
+ se->max_slice == old_max_slice;
}
RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
@@ -856,7 +1006,6 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
sum_w_vruntime_add(cfs_rq, se);
- update_zero_vruntime(cfs_rq);
se->min_vruntime = se->vruntime;
se->min_slice = se->slice;
rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
@@ -868,7 +1017,6 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
&min_vruntime_cb);
sum_w_vruntime_sub(cfs_rq, se);
- update_zero_vruntime(cfs_rq);
}
struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
@@ -968,7 +1116,7 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
/*
* Picking the ->next buddy will affect latency but not fairness.
*/
- if (sched_feat(PICK_BUDDY) &&
+ if (sched_feat(PICK_BUDDY) && protect &&
cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
/* ->next will never be delayed */
WARN_ON_ONCE(cfs_rq->next->sched_delayed);
@@ -1075,6 +1223,7 @@ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
* EEVDF: vd_i = ve_i + r_i / w_i
*/
se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
+ avg_vruntime(cfs_rq);
/*
* The task has consumed its request, reschedule.
@@ -3784,19 +3933,128 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
se_weight(se) * -se->avg.load_sum);
}
-static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
+static void
+rescale_entity(struct sched_entity *se, unsigned long weight, bool rel_vprot)
+{
+ unsigned long old_weight = se->load.weight;
+
+ /*
+ * VRUNTIME
+ * --------
+ *
+ * COROLLARY #1: The virtual runtime of the entity needs to be
+ * adjusted if re-weight at !0-lag point.
+ *
+ * Proof: For contradiction assume this is not true, so we can
+ * re-weight without changing vruntime at !0-lag point.
+ *
+ * Weight VRuntime Avg-VRuntime
+ * before w v V
+ * after w' v' V'
+ *
+ * Since lag needs to be preserved through re-weight:
+ *
+ * lag = (V - v)*w = (V'- v')*w', where v = v'
+ * ==> V' = (V - v)*w/w' + v (1)
+ *
+ * Let W be the total weight of the entities before reweight,
+ * since V' is the new weighted average of entities:
+ *
+ * V' = (WV + w'v - wv) / (W + w' - w) (2)
+ *
+ * by using (1) & (2) we obtain:
+ *
+ * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
+ * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
+ * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
+ * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
+ *
+ * Since we are doing at !0-lag point which means V != v, we
+ * can simplify (3):
+ *
+ * ==> W / (W + w' - w) = w / w'
+ * ==> Ww' = Ww + ww' - ww
+ * ==> W * (w' - w) = w * (w' - w)
+ * ==> W = w (re-weight indicates w' != w)
+ *
+ * So the cfs_rq contains only one entity, hence vruntime of
+ * the entity @v should always equal to the cfs_rq's weighted
+ * average vruntime @V, which means we will always re-weight
+ * at 0-lag point, thus breach assumption. Proof completed.
+ *
+ *
+ * COROLLARY #2: Re-weight does NOT affect weighted average
+ * vruntime of all the entities.
+ *
+ * Proof: According to corollary #1, Eq. (1) should be:
+ *
+ * (V - v)*w = (V' - v')*w'
+ * ==> v' = V' - (V - v)*w/w' (4)
+ *
+ * According to the weighted average formula, we have:
+ *
+ * V' = (WV - wv + w'v') / (W - w + w')
+ * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
+ * = (WV - wv + w'V' - Vw + wv) / (W - w + w')
+ * = (WV + w'V' - Vw) / (W - w + w')
+ *
+ * ==> V'*(W - w + w') = WV + w'V' - Vw
+ * ==> V' * (W - w) = (W - w) * V (5)
+ *
+ * If the entity is the only one in the cfs_rq, then reweight
+ * always occurs at 0-lag point, so V won't change. Or else
+ * there are other entities, hence W != w, then Eq. (5) turns
+ * into V' = V. So V won't change in either case, proof done.
+ *
+ *
+ * So according to corollary #1 & #2, the effect of re-weight
+ * on vruntime should be:
+ *
+ * v' = V' - (V - v) * w / w' (4)
+ * = V - (V - v) * w / w'
+ * = V - vl * w / w'
+ * = V - vl'
+ */
+ se->vlag = div64_long(se->vlag * old_weight, weight);
+
+ /*
+ * DEADLINE
+ * --------
+ *
+ * When the weight changes, the virtual time slope changes and
+ * we should adjust the relative virtual deadline accordingly.
+ *
+ * d' = v' + (d - v)*w/w'
+ * = V' - (V - v)*w/w' + (d - v)*w/w'
+ * = V - (V - v)*w/w' + (d - v)*w/w'
+ * = V + (d - V)*w/w'
+ */
+ if (se->rel_deadline)
+ se->deadline = div64_long(se->deadline * old_weight, weight);
+
+ if (rel_vprot)
+ se->vprot = div64_long(se->vprot * old_weight, weight);
+}
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
bool curr = cfs_rq->curr == se;
+ bool rel_vprot = false;
+ u64 avruntime = 0;
if (se->on_rq) {
/* commit outstanding execution time */
update_curr(cfs_rq);
- update_entity_lag(cfs_rq, se);
- se->deadline -= se->vruntime;
+ avruntime = avg_vruntime(cfs_rq);
+ se->vlag = entity_lag(cfs_rq, se, avruntime);
+ se->deadline -= avruntime;
se->rel_deadline = 1;
+ if (curr && protect_slice(se)) {
+ se->vprot -= avruntime;
+ rel_vprot = true;
+ }
+
cfs_rq->nr_queued--;
if (!curr)
__dequeue_entity(cfs_rq, se);
@@ -3804,25 +4062,23 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
}
dequeue_load_avg(cfs_rq, se);
- /*
- * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
- * we need to scale se->vlag when w_i changes.
- */
- se->vlag = div_s64(se->vlag * se->load.weight, weight);
- if (se->rel_deadline)
- se->deadline = div_s64(se->deadline * se->load.weight, weight);
+ rescale_entity(se, weight, rel_vprot);
update_load_set(&se->load, weight);
do {
u32 divider = get_pelt_divider(&se->avg);
-
se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
} while (0);
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
- place_entity(cfs_rq, se, 0);
+ if (rel_vprot)
+ se->vprot += avruntime;
+ se->deadline += avruntime;
+ se->rel_deadline = 0;
+ se->vruntime = avruntime - se->vlag;
+
update_load_add(&cfs_rq->load, se->load.weight);
if (!curr)
__enqueue_entity(cfs_rq, se);
@@ -5096,6 +5352,7 @@ static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
u64 vslice, vruntime = avg_vruntime(cfs_rq);
+ bool update_zero = false;
s64 lag = 0;
if (!se->custom_slice)
@@ -5112,7 +5369,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
struct sched_entity *curr = cfs_rq->curr;
- unsigned long load;
+ long load, weight;
lag = se->vlag;
@@ -5170,17 +5427,44 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
load = cfs_rq->sum_weight;
if (curr && curr->on_rq)
- load += scale_load_down(curr->load.weight);
+ load += avg_vruntime_weight(cfs_rq, curr->load.weight);
- lag *= load + scale_load_down(se->load.weight);
+ weight = avg_vruntime_weight(cfs_rq, se->load.weight);
+ lag *= load + weight;
if (WARN_ON_ONCE(!load))
load = 1;
- lag = div_s64(lag, load);
+ lag = div64_long(lag, load);
+
+ /*
+ * A heavy entity (relative to the tree) will pull the
+ * avg_vruntime close to its vruntime position on enqueue. But
+ * the zero_vruntime point is only updated at the next
+ * update_deadline()/place_entity()/update_entity_lag().
+ *
+ * Specifically (see the comment near avg_vruntime_weight()):
+ *
+ * sum_w_vruntime = \Sum (v_i - v0) * w_i
+ *
+ * Note that if v0 is near a light entity, both terms will be
+ * small for the light entity, while in that case both terms
+ * are large for the heavy entity, leading to risk of
+ * overflow.
+ *
+ * OTOH if v0 is near the heavy entity, then the difference is
+ * larger for the light entity, but the factor is small, while
+ * for the heavy entity the difference is small but the factor
+ * is large, avoiding the multiplication overflow.
+ */
+ if (weight > load)
+ update_zero = true;
}
se->vruntime = vruntime - lag;
- if (se->rel_deadline) {
+ if (update_zero)
+ update_zero_vruntime(cfs_rq, -lag);
+
+ if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
se->deadline += se->vruntime;
se->rel_deadline = 0;
return;
@@ -5330,13 +5614,6 @@ static void clear_delayed(struct sched_entity *se)
}
}
-static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
-{
- clear_delayed(se);
- if (sched_feat(DELAY_ZERO) && se->vlag > 0)
- se->vlag = 0;
-}
-
static bool
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -5362,6 +5639,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (sched_feat(DELAY_DEQUEUE) && delay &&
!entity_eligible(cfs_rq, se)) {
update_load_avg(cfs_rq, se, 0);
+ update_entity_lag(cfs_rq, se);
set_delayed(se);
return false;
}
@@ -5401,7 +5679,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_cfs_group(se);
if (flags & DEQUEUE_DELAYED)
- finish_delayed_dequeue_entity(se);
+ clear_delayed(se);
if (cfs_rq->nr_queued == 0) {
update_idle_cfs_rq_clock_pelt(cfs_rq);
@@ -5420,7 +5698,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
}
static void
-set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, bool first)
{
clear_buddies(cfs_rq, se);
@@ -5435,7 +5713,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
- set_protect_slice(cfs_rq, se);
+ if (first)
+ set_protect_slice(cfs_rq, se);
}
update_stats_curr_start(cfs_rq, se);
@@ -5530,7 +5809,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
* validating it and just reschedule.
*/
if (queued) {
- resched_curr_lazy(rq_of(cfs_rq));
+ resched_curr(rq_of(cfs_rq));
return;
}
#endif
@@ -6735,27 +7014,41 @@ static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
+ unsigned long scale = 1024;
+ unsigned long util = 0;
+ u64 vdelta;
+ u64 delta;
WARN_ON_ONCE(task_rq(p) != rq);
- if (rq->cfs.h_nr_queued > 1) {
- u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
- u64 slice = se->slice;
- s64 delta = slice - ran;
+ if (rq->cfs.h_nr_queued <= 1)
+ return;
- if (delta < 0) {
- if (task_current_donor(rq, p))
- resched_curr(rq);
- return;
- }
- hrtick_start(rq, delta);
+ /*
+ * Compute time until virtual deadline
+ */
+ vdelta = se->deadline - se->vruntime;
+ if ((s64)vdelta < 0) {
+ if (task_current_donor(rq, p))
+ resched_curr(rq);
+ return;
+ }
+ delta = (se->load.weight * vdelta) / NICE_0_LOAD;
+
+ /*
+ * Correct for instantaneous load of other classes.
+ */
+ util += cpu_util_irq(rq);
+ if (util && util < 1024) {
+ scale *= 1024;
+ scale /= (1024 - util);
}
+
+ hrtick_start(rq, (scale * delta) / 1024);
}
/*
- * called from enqueue/dequeue and updates the hrtick when the
- * current task is from our class and nr_running is low enough
- * to matter.
+ * Called on enqueue to start the hrtick when h_nr_queued becomes more than 1.
*/
static void hrtick_update(struct rq *rq)
{
@@ -6764,6 +7057,9 @@ static void hrtick_update(struct rq *rq)
if (!hrtick_enabled_fair(rq) || donor->sched_class != &fair_sched_class)
return;
+ if (hrtick_active(rq))
+ return;
+
hrtick_start_fair(rq, donor);
}
#else /* !CONFIG_SCHED_HRTICK: */
@@ -6779,16 +7075,15 @@ static inline void hrtick_update(struct rq *rq)
static inline bool cpu_overutilized(int cpu)
{
- unsigned long rq_util_min, rq_util_max;
+ unsigned long rq_util_max;
if (!sched_energy_enabled())
return false;
- rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
/* Return true only if the utilization doesn't fit CPU's capacity */
- return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
+ return !util_fits_cpu(cpu_util_cfs(cpu), 0, rq_util_max, cpu);
}
/*
@@ -6826,9 +7121,15 @@ static int sched_idle_rq(struct rq *rq)
rq->nr_running);
}
-static int sched_idle_cpu(int cpu)
+static int choose_sched_idle_rq(struct rq *rq, struct task_struct *p)
+{
+ return sched_idle_rq(rq) && !task_has_idle_policy(p);
+}
+
+static int choose_idle_cpu(int cpu, struct task_struct *p)
{
- return sched_idle_rq(cpu_rq(cpu));
+ return available_idle_cpu(cpu) ||
+ choose_sched_idle_rq(cpu_rq(cpu), p);
}
static void
@@ -6844,18 +7145,14 @@ requeue_delayed_entity(struct sched_entity *se)
WARN_ON_ONCE(!se->sched_delayed);
WARN_ON_ONCE(!se->on_rq);
- if (sched_feat(DELAY_ZERO)) {
- update_entity_lag(cfs_rq, se);
- if (se->vlag > 0) {
- cfs_rq->nr_queued--;
- if (se != cfs_rq->curr)
- __dequeue_entity(cfs_rq, se);
- se->vlag = 0;
- place_entity(cfs_rq, se, 0);
- if (se != cfs_rq->curr)
- __enqueue_entity(cfs_rq, se);
- cfs_rq->nr_queued++;
- }
+ if (update_entity_lag(cfs_rq, se)) {
+ cfs_rq->nr_queued--;
+ if (se != cfs_rq->curr)
+ __dequeue_entity(cfs_rq, se);
+ place_entity(cfs_rq, se, 0);
+ if (se != cfs_rq->curr)
+ __enqueue_entity(cfs_rq, se);
+ cfs_rq->nr_queued++;
}
update_load_avg(cfs_rq, se, 0);
@@ -7086,9 +7383,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
WARN_ON_ONCE(!task_sleep);
WARN_ON_ONCE(p->on_rq != 1);
- /* Fix-up what dequeue_task_fair() skipped */
- hrtick_update(rq);
-
/*
* Fix-up what block_task() skipped.
*
@@ -7122,8 +7416,6 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/*
* Must not reference @p after dequeue_entities(DEQUEUE_DELAYED).
*/
-
- hrtick_update(rq);
return true;
}
@@ -7393,7 +7685,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
if (!sched_core_cookie_match(rq, p))
continue;
- if (sched_idle_cpu(i))
+ if (choose_sched_idle_rq(rq, p))
return i;
if (available_idle_cpu(i)) {
@@ -7484,8 +7776,7 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas
static inline int __select_idle_cpu(int cpu, struct task_struct *p)
{
- if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
- sched_cpu_cookie_match(cpu_rq(cpu), p))
+ if (choose_idle_cpu(cpu, p) && sched_cpu_cookie_match(cpu_rq(cpu), p))
return cpu;
return -1;
@@ -7558,7 +7849,8 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
if (!available_idle_cpu(cpu)) {
idle = false;
if (*idle_cpu == -1) {
- if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
+ if (choose_sched_idle_rq(cpu_rq(cpu), p) &&
+ cpumask_test_cpu(cpu, cpus)) {
*idle_cpu = cpu;
break;
}
@@ -7593,7 +7885,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
*/
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
- if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+ if (choose_idle_cpu(cpu, p))
return cpu;
}
@@ -7632,21 +7924,26 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
- struct sched_domain_shared *sd_share;
-
- cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
if (sched_feat(SIS_UTIL)) {
- sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, target));
- if (sd_share) {
- /* because !--nr is the condition to stop scan */
- nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
- /* overloaded LLC is unlikely to have idle cpu/core */
- if (nr == 1)
- return -1;
- }
+ /*
+ * Increment because !--nr is the condition to stop scan.
+ *
+ * Since "sd" is "sd_llc" for the target CPU dereferenced in the
+ * caller, it is safe to directly dereference "sd->shared".
+ * Topology bits always ensure it is assigned for "sd_llc" and it
+ * cannot disappear as long as we have an RCU protected
+ * reference to one of the associated "sd" here.
+ */
+ nr = READ_ONCE(sd->shared->nr_idle_scan) + 1;
+ /* overloaded LLC is unlikely to have idle cpu/core */
+ if (nr == 1)
+ return -1;
}
+ if (!cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr))
+ return -1;
+
if (static_branch_unlikely(&sched_cluster_active)) {
struct sched_group *sg = sd->groups;
@@ -7715,7 +8012,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
for_each_cpu_wrap(cpu, cpus, target) {
unsigned long cpu_cap = capacity_of(cpu);
- if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
+ if (!choose_idle_cpu(cpu, p))
continue;
fits = util_fits_cpu(task_util, util_min, util_max, cpu);
@@ -7786,7 +8083,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
*/
lockdep_assert_irqs_disabled();
- if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
+ if (choose_idle_cpu(target, p) &&
asym_fits_cpu(task_util, util_min, util_max, target))
return target;
@@ -7794,7 +8091,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* If the previous CPU is cache affine and idle, don't be stupid:
*/
if (prev != target && cpus_share_cache(prev, target) &&
- (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
+ choose_idle_cpu(prev, p) &&
asym_fits_cpu(task_util, util_min, util_max, prev)) {
if (!static_branch_unlikely(&sched_cluster_active) ||
@@ -7826,7 +8123,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
- (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
+ choose_idle_cpu(recent_used_cpu, p) &&
cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
@@ -8326,10 +8623,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
struct perf_domain *pd;
struct energy_env eenv;
- rcu_read_lock();
pd = rcu_dereference_all(rd->pd);
if (!pd)
- goto unlock;
+ return target;
/*
* Energy-aware wake-up happens on the lowest sched_domain starting
@@ -8339,13 +8635,13 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
sd = sd->parent;
if (!sd)
- goto unlock;
+ return target;
target = prev_cpu;
sync_entity_load_avg(&p->se);
if (!task_util_est(p) && p_util_min == 0)
- goto unlock;
+ return target;
eenv_task_busy_time(&eenv, p, prev_cpu);
@@ -8440,7 +8736,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
prev_cpu);
/* CPU utilization has changed */
if (prev_delta < base_energy)
- goto unlock;
+ return target;
prev_delta -= base_energy;
prev_actual_cap = cpu_actual_cap;
best_delta = min(best_delta, prev_delta);
@@ -8464,7 +8760,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
max_spare_cap_cpu);
/* CPU utilization has changed */
if (cur_delta < base_energy)
- goto unlock;
+ return target;
cur_delta -= base_energy;
/*
@@ -8481,7 +8777,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
best_actual_cap = cpu_actual_cap;
}
}
- rcu_read_unlock();
if ((best_fits > prev_fits) ||
((best_fits > 0) && (best_delta < prev_delta)) ||
@@ -8489,11 +8784,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
target = best_energy_cpu;
return target;
-
-unlock:
- rcu_read_unlock();
-
- return target;
}
/*
@@ -8538,7 +8828,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
}
- rcu_read_lock();
for_each_domain(cpu, tmp) {
/*
* If both 'cpu' and 'prev_cpu' are part of this domain,
@@ -8564,14 +8853,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
break;
}
- if (unlikely(sd)) {
- /* Slow path */
- new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
- } else if (wake_flags & WF_TTWU) { /* XXX always ? */
- /* Fast path */
- new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
- }
- rcu_read_unlock();
+ /* Slow path */
+ if (unlikely(sd))
+ return sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
+
+ /* Fast path */
+ if (wake_flags & WF_TTWU)
+ return select_idle_sibling(p, prev_cpu, new_cpu);
return new_cpu;
}
@@ -8862,8 +9150,10 @@ pick:
return;
preempt:
- if (preempt_action == PREEMPT_WAKEUP_SHORT)
+ if (preempt_action == PREEMPT_WAKEUP_SHORT) {
cancel_protect_slice(se);
+ clear_buddies(cfs_rq, se);
+ }
resched_curr_lazy(rq);
}
@@ -8948,13 +9238,13 @@ again:
pse = parent_entity(pse);
}
if (se_depth >= pse_depth) {
- set_next_entity(cfs_rq_of(se), se);
+ set_next_entity(cfs_rq_of(se), se, true);
se = parent_entity(se);
}
}
put_prev_entity(cfs_rq, pse);
- set_next_entity(cfs_rq, se);
+ set_next_entity(cfs_rq, se, true);
__set_next_task_fair(rq, p, true);
}
@@ -9054,7 +9344,7 @@ static void yield_task_fair(struct rq *rq)
*/
if (entity_eligible(cfs_rq, se)) {
se->vruntime = se->deadline;
- se->deadline += calc_delta_fair(se->slice, se);
+ update_deadline(cfs_rq, se);
}
}
@@ -9711,32 +10001,6 @@ next:
}
/*
- * attach_task() -- attach the task detached by detach_task() to its new rq.
- */
-static void attach_task(struct rq *rq, struct task_struct *p)
-{
- lockdep_assert_rq_held(rq);
-
- WARN_ON_ONCE(task_rq(p) != rq);
- activate_task(rq, p, ENQUEUE_NOCLOCK);
- wakeup_preempt(rq, p, 0);
-}
-
-/*
- * attach_one_task() -- attaches the task returned from detach_one_task() to
- * its new rq.
- */
-static void attach_one_task(struct rq *rq, struct task_struct *p)
-{
- struct rq_flags rf;
-
- rq_lock(rq, &rf);
- update_rq_clock(rq);
- attach_task(rq, p);
- rq_unlock(rq, &rf);
-}
-
-/*
* attach_tasks() -- attaches all tasks detached by detach_tasks() to their
* new rq.
*/
@@ -9973,6 +10237,7 @@ struct sg_lb_stats {
unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
unsigned int group_smt_balance; /* Task on busy SMT be moved */
unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
+ unsigned int group_overutilized; /* At least one CPU is overutilized in the group */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -10205,6 +10470,13 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
static inline bool
group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
{
+ /*
+ * With EAS and uclamp, 1 CPU in the group must be overutilized to
+ * consider the group overloaded.
+ */
+ if (sched_energy_enabled() && !sgs->group_overutilized)
+ return false;
+
if (sgs->sum_nr_running <= sgs->group_weight)
return false;
@@ -10388,14 +10660,12 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
* @group: sched_group whose statistics are to be updated.
* @sgs: variable to hold the statistics for this group.
* @sg_overloaded: sched_group is overloaded
- * @sg_overutilized: sched_group is overutilized
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sd_lb_stats *sds,
struct sched_group *group,
struct sg_lb_stats *sgs,
- bool *sg_overloaded,
- bool *sg_overutilized)
+ bool *sg_overloaded)
{
int i, nr_running, local_group, sd_flags = env->sd->flags;
bool balancing_at_rd = !env->sd->parent;
@@ -10417,7 +10687,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->sum_nr_running += nr_running;
if (cpu_overutilized(i))
- *sg_overutilized = 1;
+ sgs->group_overutilized = 1;
/*
* No need to call idle_cpu() if nr_running is not 0
@@ -10993,6 +11263,7 @@ static void update_idle_cpu_scan(struct lb_env *env,
unsigned long sum_util)
{
struct sched_domain_shared *sd_share;
+ struct sched_domain *sd = env->sd;
int llc_weight, pct;
u64 x, y, tmp;
/*
@@ -11006,11 +11277,7 @@ static void update_idle_cpu_scan(struct lb_env *env,
if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE)
return;
- llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
- if (env->sd->span_weight != llc_weight)
- return;
-
- sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, env->dst_cpu));
+ sd_share = sd->shared;
if (!sd_share)
return;
@@ -11044,10 +11311,11 @@ static void update_idle_cpu_scan(struct lb_env *env,
*/
/* equation [3] */
x = sum_util;
+ llc_weight = sd->span_weight;
do_div(x, llc_weight);
/* equation [4] */
- pct = env->sd->imbalance_pct;
+ pct = sd->imbalance_pct;
tmp = x * x * pct * pct;
do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);
@@ -11088,13 +11356,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
update_group_capacity(env->sd, env->dst_cpu);
}
- update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized);
+ update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded);
if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
sds->busiest_stat = *sgs;
}
+ sg_overutilized |= sgs->group_overutilized;
+
/* Now, start updating sd_lb_stats */
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
@@ -12215,7 +12485,30 @@ static inline void update_newidle_stats(struct sched_domain *sd, unsigned int su
sd->newidle_success += success;
if (sd->newidle_call >= 1024) {
- sd->newidle_ratio = sd->newidle_success;
+ u64 now = sched_clock();
+ s64 delta = now - sd->newidle_stamp;
+ sd->newidle_stamp = now;
+ int ratio = 0;
+
+ if (delta < 0)
+ delta = 0;
+
+ if (sched_feat(NI_RATE)) {
+ /*
+ * ratio delta freq
+ *
+ * 1024 - 4 s - 128 Hz
+ * 512 - 2 s - 256 Hz
+ * 256 - 1 s - 512 Hz
+ * 128 - .5 s - 1024 Hz
+ * 64 - .25 s - 2048 Hz
+ */
+ ratio = delta >> 22;
+ }
+
+ ratio += sd->newidle_success;
+
+ sd->newidle_ratio = min(1024, ratio);
sd->newidle_call /= 2;
sd->newidle_success /= 2;
}
@@ -12262,7 +12555,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
{
int continue_balancing = 1;
int cpu = rq->cpu;
- int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
+ int busy = idle != CPU_IDLE && !sched_idle_rq(rq);
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
@@ -12300,7 +12593,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
* state even if we migrated tasks. Update it.
*/
idle = idle_cpu(cpu);
- busy = !idle && !sched_idle_cpu(cpu);
+ busy = !idle && !sched_idle_rq(rq);
}
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, busy);
@@ -12345,14 +12638,14 @@ static inline int on_null_domain(struct rq *rq)
*/
static inline int find_new_ilb(void)
{
+ int this_cpu = smp_processor_id();
const struct cpumask *hk_mask;
int ilb_cpu;
hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
-
- if (ilb_cpu == smp_processor_id())
+ if (ilb_cpu == this_cpu)
continue;
if (idle_cpu(ilb_cpu))
@@ -12908,7 +13201,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
t0 = sched_clock_cpu(this_cpu);
__sched_balance_update_blocked_averages(this_rq);
- this_rq->next_class = &fair_sched_class;
+ rq_modified_begin(this_rq, &fair_sched_class);
raw_spin_rq_unlock(this_rq);
for_each_domain(this_cpu, sd) {
@@ -12922,7 +13215,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
if (sd->flags & SD_BALANCE_NEWIDLE) {
unsigned int weight = 1;
- if (sched_feat(NI_RANDOM)) {
+ if (sched_feat(NI_RANDOM) && sd->newidle_ratio < 1024) {
/*
* Throw a 1k sided dice; and only run
* newidle_balance according to the success
@@ -12975,7 +13268,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
pulled_task = 1;
/* If a higher prio class was modified, restart the pick */
- if (sched_class_above(this_rq->next_class, &fair_sched_class))
+ if (rq_modified_above(this_rq, &fair_sched_class))
pulled_task = -1;
out:
@@ -13365,11 +13658,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
entity_tick(cfs_rq, se, queued);
}
- if (queued) {
- if (!need_resched())
- hrtick_start_fair(rq, curr);
+ if (queued)
return;
- }
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
@@ -13568,7 +13858,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- set_next_entity(cfs_rq, se);
+ set_next_entity(cfs_rq, se, first);
/* ensure bandwidth has been allocated on our new cfs_rq */
account_cfs_rq_runtime(cfs_rq, 0);
}
@@ -13951,7 +14241,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
}
if (ng) {
- gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
+ gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
}
print_numa_stats(m, node, tsf, tpf, gsf, gpf);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 136a6584be79..84c4fe3abd74 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -58,13 +58,20 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
SCHED_FEAT(DELAY_DEQUEUE, true)
SCHED_FEAT(DELAY_ZERO, true)
+SCHED_FEAT(PARANOID_AVG, false)
+
/*
* Allow wakeup-time preemption of the current task:
*/
SCHED_FEAT(WAKEUP_PREEMPTION, true)
+#ifdef CONFIG_HRTIMER_REARM_DEFERRED
+SCHED_FEAT(HRTICK, true)
+SCHED_FEAT(HRTICK_DL, true)
+#else
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(HRTICK_DL, false)
+#endif
/*
* Decrement CPU capacity based on time not spent running tasks
@@ -126,3 +133,4 @@ SCHED_FEAT(LATENCY_WARN, false)
* Do newidle balancing proportional to its success rate using randomization.
*/
SCHED_FEAT(NI_RANDOM, true)
+SCHED_FEAT(NI_RATE, true)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 3681b6ad9276..a83be0c834dd 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -161,6 +161,14 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
return cpuidle_enter(drv, dev, next_state);
}
+static void idle_call_stop_or_retain_tick(bool stop_tick)
+{
+ if (stop_tick || tick_nohz_tick_stopped())
+ tick_nohz_idle_stop_tick();
+ else
+ tick_nohz_idle_retain_tick();
+}
+
/**
* cpuidle_idle_call - the main idle function
*
@@ -170,7 +178,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
* set, and it returns with polling set. If it ever stops polling, it
* must clear the polling bit.
*/
-static void cpuidle_idle_call(void)
+static void cpuidle_idle_call(bool stop_tick)
{
struct cpuidle_device *dev = cpuidle_get_device();
struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
@@ -186,7 +194,7 @@ static void cpuidle_idle_call(void)
}
if (cpuidle_not_available(drv, dev)) {
- tick_nohz_idle_stop_tick();
+ idle_call_stop_or_retain_tick(stop_tick);
default_idle_call();
goto exit_idle;
@@ -221,24 +229,35 @@ static void cpuidle_idle_call(void)
next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns);
call_cpuidle(drv, dev, next_state);
- } else {
- bool stop_tick = true;
+ } else if (drv->state_count > 1) {
+ /*
+ * stop_tick is expected to be true by default by cpuidle
+ * governors, which allows them to select idle states with
+ * target residency above the tick period length.
+ */
+ stop_tick = true;
/*
* Ask the cpuidle framework to choose a convenient idle state.
*/
next_state = cpuidle_select(drv, dev, &stop_tick);
- if (stop_tick || tick_nohz_tick_stopped())
- tick_nohz_idle_stop_tick();
- else
- tick_nohz_idle_retain_tick();
+ idle_call_stop_or_retain_tick(stop_tick);
entered_state = call_cpuidle(drv, dev, next_state);
/*
* Give the governor an opportunity to reflect on the outcome
*/
cpuidle_reflect(dev, entered_state);
+ } else {
+ idle_call_stop_or_retain_tick(stop_tick);
+
+ /*
+ * If there is only a single idle state (or none), there is
+ * nothing meaningful for the governor to choose. Skip the
+ * governor and always use state 0.
+ */
+ call_cpuidle(drv, dev, 0);
}
exit_idle:
@@ -259,6 +278,7 @@ exit_idle:
static void do_idle(void)
{
int cpu = smp_processor_id();
+ bool got_tick = false;
/*
* Check if we need to update blocked load
@@ -329,8 +349,9 @@ static void do_idle(void)
tick_nohz_idle_restart_tick();
cpu_idle_poll();
} else {
- cpuidle_idle_call();
+ cpuidle_idle_call(got_tick);
}
+ got_tick = tick_nohz_idle_got_tick();
arch_cpu_idle_exit();
}
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 3b725d39c06e..ef152d401fe2 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -123,8 +123,6 @@ int housekeeping_update(struct cpumask *isol_mask)
struct cpumask *trial, *old = NULL;
int err;
- lockdep_assert_cpus_held();
-
trial = kmalloc(cpumask_size(), GFP_KERNEL);
if (!trial)
return -ENOMEM;
@@ -136,7 +134,7 @@ int housekeeping_update(struct cpumask *isol_mask)
}
if (!housekeeping.flags)
- static_branch_enable_cpuslocked(&housekeeping_overridden);
+ static_branch_enable(&housekeeping_overridden);
if (housekeeping.flags & HK_FLAG_DOMAIN)
old = housekeeping_cpumask_dereference(HK_TYPE_DOMAIN);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f69e1f16d923..4ee8faf01441 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1302,13 +1302,18 @@ update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
int flags)
{
struct task_struct *p = NULL;
+ struct rq *rq = rq_of_rt_rq(rt_rq);
if (!schedstat_enabled())
return;
- if (rt_entity_is_task(rt_se))
+ if (rt_entity_is_task(rt_se)) {
p = rt_task_of(rt_se);
+ if (p != rq->curr)
+ update_stats_wait_end_rt(rt_rq, rt_se);
+ }
+
if ((flags & DEQUEUE_SLEEP) && p) {
unsigned int state;
@@ -1853,13 +1858,22 @@ static int find_lowest_rq(struct task_struct *task)
static struct task_struct *pick_next_pushable_task(struct rq *rq)
{
- struct task_struct *p;
+ struct plist_head *head = &rq->rt.pushable_tasks;
+ struct task_struct *i, *p = NULL;
if (!has_pushable_tasks(rq))
return NULL;
- p = plist_first_entry(&rq->rt.pushable_tasks,
- struct task_struct, pushable_tasks);
+ plist_for_each_entry(i, head, pushable_tasks) {
+ /* make sure task isn't on_cpu (possible with proxy-exec) */
+ if (!task_on_cpu(rq, i)) {
+ p = i;
+ break;
+ }
+ }
+
+ if (!p)
+ return NULL;
BUG_ON(rq->cpu != task_cpu(p));
BUG_ON(task_current(rq, p));
@@ -2652,7 +2666,7 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
{
struct rt_schedulable_data *d = data;
struct task_group *child;
- unsigned long total, sum = 0;
+ u64 total, sum = 0;
u64 period, runtime;
period = ktime_to_ns(tg->rt_bandwidth.rt_period);
@@ -2676,9 +2690,6 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
return -EBUSY;
- if (WARN_ON(!rt_group_sched_enabled() && tg != &root_task_group))
- return -EBUSY;
-
total = to_ratio(period, runtime);
/*
@@ -2818,19 +2829,6 @@ long sched_group_rt_period(struct task_group *tg)
return rt_period_us;
}
-#ifdef CONFIG_SYSCTL
-static int sched_rt_global_constraints(void)
-{
- int ret = 0;
-
- mutex_lock(&rt_constraints_mutex);
- ret = __rt_schedulable(NULL, 0, 0);
- mutex_unlock(&rt_constraints_mutex);
-
- return ret;
-}
-#endif /* CONFIG_SYSCTL */
-
int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
{
/* Don't accept real-time tasks when there is no way for them to run */
@@ -2840,14 +2838,6 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
return 1;
}
-#else /* !CONFIG_RT_GROUP_SCHED: */
-
-#ifdef CONFIG_SYSCTL
-static int sched_rt_global_constraints(void)
-{
- return 0;
-}
-#endif /* CONFIG_SYSCTL */
#endif /* !CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_SYSCTL
@@ -2859,11 +2849,14 @@ static int sched_rt_global_validate(void)
NSEC_PER_USEC > max_rt_runtime)))
return -EINVAL;
- return 0;
-}
+#ifdef CONFIG_RT_GROUP_SCHED
+ if (!rt_group_sched_enabled())
+ return 0;
-static void sched_rt_do_global(void)
-{
+ scoped_guard(mutex, &rt_constraints_mutex)
+ return __rt_schedulable(NULL, 0, 0);
+#endif
+ return 0;
}
static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
@@ -2889,11 +2882,6 @@ static int sched_rt_handler(const struct ctl_table *table, int write, void *buff
if (ret)
goto undo;
- ret = sched_rt_global_constraints();
- if (ret)
- goto undo;
-
- sched_rt_do_global();
sched_dl_do_global();
}
if (0) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b82fb70a9d54..9f63b15d309d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -356,7 +356,7 @@ extern int sched_dl_global_validate(void);
extern void sched_dl_do_global(void);
extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
-extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
+extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
@@ -684,8 +684,9 @@ struct cfs_rq {
s64 sum_w_vruntime;
u64 sum_weight;
-
u64 zero_vruntime;
+ unsigned int sum_shift;
+
#ifdef CONFIG_SCHED_CORE
unsigned int forceidle_seq;
u64 zero_vruntime_fi;
@@ -782,7 +783,6 @@ enum scx_rq_flags {
SCX_RQ_ONLINE = 1 << 0,
SCX_RQ_CAN_STOP_TICK = 1 << 1,
SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */
- SCX_RQ_BYPASSING = 1 << 4,
SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */
SCX_RQ_BAL_CB_PENDING = 1 << 6, /* must queue a cb after dispatching */
@@ -798,19 +798,29 @@ struct scx_rq {
u64 extra_enq_flags; /* see move_task_to_local_dsq() */
u32 nr_running;
u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */
+ bool in_select_cpu;
bool cpu_released;
u32 flags;
+ u32 nr_immed; /* ENQ_IMMED tasks on local_dsq */
u64 clock; /* current per-rq clock -- see scx_bpf_now() */
cpumask_var_t cpus_to_kick;
cpumask_var_t cpus_to_kick_if_idle;
cpumask_var_t cpus_to_preempt;
cpumask_var_t cpus_to_wait;
+ cpumask_var_t cpus_to_sync;
+ bool kick_sync_pending;
unsigned long kick_sync;
- local_t reenq_local_deferred;
+
+ struct task_struct *sub_dispatch_prev;
+
+ raw_spinlock_t deferred_reenq_lock;
+ u64 deferred_reenq_locals_seq;
+ struct list_head deferred_reenq_locals; /* scheds requesting reenq of local DSQ */
+ struct list_head deferred_reenq_users; /* user DSQs requesting reenq */
struct balance_callback deferred_bal_cb;
+ struct balance_callback kick_sync_bal_cb;
struct irq_work deferred_irq_work;
struct irq_work kick_cpus_irq_work;
- struct scx_dispatch_q bypass_dsq;
};
#endif /* CONFIG_SCHED_CLASS_EXT */
@@ -1285,6 +1295,8 @@ struct rq {
call_single_data_t hrtick_csd;
struct hrtimer hrtick_timer;
ktime_t hrtick_time;
+ ktime_t hrtick_delay;
+ unsigned int hrtick_sched;
#endif
#ifdef CONFIG_SCHEDSTATS
@@ -1606,15 +1618,18 @@ extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
extern bool raw_spin_rq_trylock(struct rq *rq)
__cond_acquires(true, __rq_lockp(rq));
-extern void raw_spin_rq_unlock(struct rq *rq)
- __releases(__rq_lockp(rq));
-
static inline void raw_spin_rq_lock(struct rq *rq)
__acquires(__rq_lockp(rq))
{
raw_spin_rq_lock_nested(rq, 0);
}
+static inline void raw_spin_rq_unlock(struct rq *rq)
+ __releases(__rq_lockp(rq))
+{
+ raw_spin_unlock(rq_lockp(rq));
+}
+
static inline void raw_spin_rq_lock_irq(struct rq *rq)
__acquires(__rq_lockp(rq))
{
@@ -1853,6 +1868,13 @@ static inline void scx_rq_clock_update(struct rq *rq, u64 clock) {}
static inline void scx_rq_clock_invalidate(struct rq *rq) {}
#endif /* !CONFIG_SCHED_CLASS_EXT */
+static inline void assert_balance_callbacks_empty(struct rq *rq)
+{
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_LOCKING) &&
+ rq->balance_callback &&
+ rq->balance_callback != &balance_push_callback);
+}
+
/*
* Lockdep annotation that avoids accidental unlocks; it's like a
* sticky/continuous lockdep_assert_held().
@@ -1869,7 +1891,7 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
rf->clock_update_flags = 0;
- WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback);
+ assert_balance_callbacks_empty(rq);
}
static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
@@ -2748,6 +2770,17 @@ static inline const struct sched_class *next_active_class(const struct sched_cla
#define sched_class_above(_a, _b) ((_a) < (_b))
+static inline void rq_modified_begin(struct rq *rq, const struct sched_class *class)
+{
+ if (sched_class_above(rq->next_class, class))
+ rq->next_class = class;
+}
+
+static inline bool rq_modified_above(struct rq *rq, const struct sched_class *class)
+{
+ return sched_class_above(rq->next_class, class);
+}
+
static inline bool sched_stop_runnable(struct rq *rq)
{
return rq->stop && task_on_rq_queued(rq->stop);
@@ -2838,7 +2871,7 @@ static inline void idle_set_state(struct rq *rq,
static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
- WARN_ON_ONCE(!rcu_read_lock_held());
+ lockdep_assert(rcu_read_lock_any_held());
return rq->idle_state;
}
@@ -2885,7 +2918,7 @@ extern void init_cfs_throttle_work(struct task_struct *p);
#define MAX_BW_BITS (64 - BW_SHIFT)
#define MAX_BW ((1ULL << MAX_BW_BITS) - 1)
-extern unsigned long to_ratio(u64 period, u64 runtime);
+extern u64 to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
extern void post_init_entity_util_avg(struct task_struct *p);
@@ -2990,6 +3023,29 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
+/*
+ * attach_task() -- attach the task detached by detach_task() to its new rq.
+ */
+static inline void attach_task(struct rq *rq, struct task_struct *p)
+{
+ lockdep_assert_rq_held(rq);
+
+ WARN_ON_ONCE(task_rq(p) != rq);
+ activate_task(rq, p, ENQUEUE_NOCLOCK);
+ wakeup_preempt(rq, p, 0);
+}
+
+/*
+ * attach_one_task() -- attaches the task returned from detach_one_task() to
+ * its new rq.
+ */
+static inline void attach_one_task(struct rq *rq, struct task_struct *p)
+{
+ guard(rq_lock)(rq);
+ update_rq_clock(rq);
+ attach_task(rq, p);
+}
+
#ifdef CONFIG_PREEMPT_RT
# define SCHED_NR_MIGRATE_BREAK 8
#else
@@ -3019,46 +3075,31 @@ extern unsigned int sysctl_numa_balancing_hot_threshold;
* - enabled by features
* - hrtimer is actually high res
*/
-static inline int hrtick_enabled(struct rq *rq)
+static inline bool hrtick_enabled(struct rq *rq)
{
- if (!cpu_active(cpu_of(rq)))
- return 0;
- return hrtimer_is_hres_active(&rq->hrtick_timer);
+ return cpu_active(cpu_of(rq)) && hrtimer_highres_enabled();
}
-static inline int hrtick_enabled_fair(struct rq *rq)
+static inline bool hrtick_enabled_fair(struct rq *rq)
{
- if (!sched_feat(HRTICK))
- return 0;
- return hrtick_enabled(rq);
+ return sched_feat(HRTICK) && hrtick_enabled(rq);
}
-static inline int hrtick_enabled_dl(struct rq *rq)
+static inline bool hrtick_enabled_dl(struct rq *rq)
{
- if (!sched_feat(HRTICK_DL))
- return 0;
- return hrtick_enabled(rq);
+ return sched_feat(HRTICK_DL) && hrtick_enabled(rq);
}
extern void hrtick_start(struct rq *rq, u64 delay);
-
-#else /* !CONFIG_SCHED_HRTICK: */
-
-static inline int hrtick_enabled_fair(struct rq *rq)
-{
- return 0;
-}
-
-static inline int hrtick_enabled_dl(struct rq *rq)
+static inline bool hrtick_active(struct rq *rq)
{
- return 0;
-}
-
-static inline int hrtick_enabled(struct rq *rq)
-{
- return 0;
+ return hrtimer_active(&rq->hrtick_timer);
}
+#else /* !CONFIG_SCHED_HRTICK: */
+static inline bool hrtick_enabled_fair(struct rq *rq) { return false; }
+static inline bool hrtick_enabled_dl(struct rq *rq) { return false; }
+static inline bool hrtick_enabled(struct rq *rq) { return false; }
#endif /* !CONFIG_SCHED_HRTICK */
#ifndef arch_scale_freq_tick
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 6f10db3646e7..b215b0ead9a6 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -284,6 +284,35 @@ static bool check_same_owner(struct task_struct *p)
uid_eq(cred->euid, pcred->uid));
}
+#ifdef CONFIG_RT_MUTEXES
+static inline void __setscheduler_dl_pi(int newprio, int policy,
+ struct task_struct *p,
+ struct sched_change_ctx *scope)
+{
+ /*
+ * In case a DEADLINE task (either proper or boosted) gets
+ * setscheduled to a lower priority class, check if it neeeds to
+ * inherit parameters from a potential pi_task. In that case make
+ * sure replenishment happens with the next enqueue.
+ */
+
+ if (dl_prio(newprio) && !dl_policy(policy)) {
+ struct task_struct *pi_task = rt_mutex_get_top_task(p);
+
+ if (pi_task) {
+ p->dl.pi_se = pi_task->dl.pi_se;
+ scope->flags |= ENQUEUE_REPLENISH;
+ }
+ }
+}
+#else /* !CONFIG_RT_MUTEXES */
+static inline void __setscheduler_dl_pi(int newprio, int policy,
+ struct task_struct *p,
+ struct sched_change_ctx *scope)
+{
+}
+#endif /* !CONFIG_RT_MUTEXES */
+
#ifdef CONFIG_UCLAMP_TASK
static int uclamp_validate(struct task_struct *p,
@@ -655,6 +684,7 @@ change:
__setscheduler_params(p, attr);
p->sched_class = next_class;
p->prio = newprio;
+ __setscheduler_dl_pi(newprio, policy, p, scope);
}
__setscheduler_uclamp(p, attr);
@@ -881,10 +911,10 @@ err_size:
return -E2BIG;
}
-static void get_params(struct task_struct *p, struct sched_attr *attr)
+static void get_params(struct task_struct *p, struct sched_attr *attr, unsigned int flags)
{
if (task_has_dl_policy(p)) {
- __getparam_dl(p, attr);
+ __getparam_dl(p, attr, flags);
} else if (task_has_rt_policy(p)) {
attr->sched_priority = p->rt_priority;
} else {
@@ -950,7 +980,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
return -ESRCH;
if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
- get_params(p, &attr);
+ get_params(p, &attr, 0);
return sched_setattr(p, &attr);
}
@@ -1035,7 +1065,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
int retval;
if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE ||
- usize < SCHED_ATTR_SIZE_VER0 || flags))
+ usize < SCHED_ATTR_SIZE_VER0))
return -EINVAL;
scoped_guard (rcu) {
@@ -1043,6 +1073,12 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
if (!p)
return -ESRCH;
+ if (flags) {
+ if (!task_has_dl_policy(p) ||
+ flags != SCHED_GETATTR_FLAG_DL_DYNAMIC)
+ return -EINVAL;
+ }
+
retval = security_task_getscheduler(p);
if (retval)
return retval;
@@ -1050,7 +1086,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
kattr.sched_policy = p->policy;
if (p->sched_reset_on_fork)
kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- get_params(p, &kattr);
+ get_params(p, &kattr, flags);
kattr.sched_flags &= SCHED_FLAG_ALL;
#ifdef CONFIG_UCLAMP_TASK
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 32dcddaead82..5847b83d9d55 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -4,6 +4,7 @@
*/
#include <linux/sched/isolation.h>
+#include <linux/sched/clock.h>
#include <linux/bsearch.h>
#include "sched.h"
@@ -272,7 +273,7 @@ void rebuild_sched_domains_energy(void)
static int sched_energy_aware_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- int ret, state;
+ int ret;
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -288,8 +289,7 @@ static int sched_energy_aware_handler(const struct ctl_table *table, int write,
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write) {
- state = static_branch_unlikely(&sched_energy_present);
- if (state != sysctl_sched_energy_aware)
+ if (sysctl_sched_energy_aware != sched_energy_enabled())
rebuild_sched_domains_energy();
}
@@ -387,11 +387,11 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp)
static void sched_energy_set(bool has_eas)
{
- if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
+ if (!has_eas && sched_energy_enabled()) {
if (sched_debug())
pr_info("%s: stopping EAS\n", __func__);
static_branch_disable_cpuslocked(&sched_energy_present);
- } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
+ } else if (has_eas && !sched_energy_enabled()) {
if (sched_debug())
pr_info("%s: starting EAS\n", __func__);
static_branch_enable_cpuslocked(&sched_energy_present);
@@ -684,6 +684,9 @@ static void update_top_cache_domain(int cpu)
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
+
+ /* If sd_llc exists, sd_llc_shared should exist too. */
+ WARN_ON_ONCE(!sd->shared);
sds = sd->shared;
}
@@ -732,6 +735,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
if (sd_parent_degenerate(tmp, parent)) {
tmp->parent = parent->parent;
+ /* Pick reference to parent->shared. */
+ if (parent->shared) {
+ WARN_ON_ONCE(tmp->shared);
+ tmp->shared = parent->shared;
+ parent->shared = NULL;
+ }
+
if (parent->parent) {
parent->parent->child = tmp;
parent->parent->groups->flags = tmp->flags;
@@ -781,6 +791,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
}
struct s_data {
+ struct sched_domain_shared * __percpu *sds;
struct sched_domain * __percpu *sd;
struct root_domain *rd;
};
@@ -788,6 +799,7 @@ struct s_data {
enum s_alloc {
sa_rootdomain,
sa_sd,
+ sa_sd_shared,
sa_sd_storage,
sa_none,
};
@@ -1534,6 +1546,9 @@ static void set_domain_attribute(struct sched_domain *sd,
static void __sdt_free(const struct cpumask *cpu_map);
static int __sdt_alloc(const struct cpumask *cpu_map);
+static void __sds_free(struct s_data *d, const struct cpumask *cpu_map);
+static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map);
+
static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
const struct cpumask *cpu_map)
{
@@ -1545,6 +1560,9 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
case sa_sd:
free_percpu(d->sd);
fallthrough;
+ case sa_sd_shared:
+ __sds_free(d, cpu_map);
+ fallthrough;
case sa_sd_storage:
__sdt_free(cpu_map);
fallthrough;
@@ -1560,9 +1578,11 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
if (__sdt_alloc(cpu_map))
return sa_sd_storage;
+ if (__sds_alloc(d, cpu_map))
+ return sa_sd_shared;
d->sd = alloc_percpu(struct sched_domain *);
if (!d->sd)
- return sa_sd_storage;
+ return sa_sd_shared;
d->rd = alloc_rootdomain();
if (!d->rd)
return sa_sd;
@@ -1575,21 +1595,25 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
* sched_group structure so that the subsequent __free_domain_allocs()
* will not free the data we're using.
*/
-static void claim_allocations(int cpu, struct sched_domain *sd)
+static void claim_allocations(int cpu, struct s_data *d)
{
- struct sd_data *sdd = sd->private;
+ struct sched_domain *sd;
+
+ if (atomic_read(&(*per_cpu_ptr(d->sds, cpu))->ref))
+ *per_cpu_ptr(d->sds, cpu) = NULL;
- WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
- *per_cpu_ptr(sdd->sd, cpu) = NULL;
+ for (sd = *per_cpu_ptr(d->sd, cpu); sd; sd = sd->parent) {
+ struct sd_data *sdd = sd->private;
- if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
- *per_cpu_ptr(sdd->sds, cpu) = NULL;
+ WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
+ *per_cpu_ptr(sdd->sd, cpu) = NULL;
- if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
- *per_cpu_ptr(sdd->sg, cpu) = NULL;
+ if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
+ *per_cpu_ptr(sdd->sg, cpu) = NULL;
- if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
- *per_cpu_ptr(sdd->sgc, cpu) = NULL;
+ if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
+ *per_cpu_ptr(sdd->sgc, cpu) = NULL;
+ }
}
#ifdef CONFIG_NUMA
@@ -1642,14 +1666,19 @@ sd_init(struct sched_domain_topology_level *tl,
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
int sd_id, sd_weight, sd_flags = 0;
struct cpumask *sd_span;
+ u64 now = sched_clock();
- sd_weight = cpumask_weight(tl->mask(tl, cpu));
+ sd_span = sched_domain_span(sd);
+ cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
+ sd_weight = cpumask_weight(sd_span);
+ sd_id = cpumask_first(sd_span);
if (tl->sd_flags)
sd_flags = (*tl->sd_flags)();
if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
- "wrong sd_flags in topology description\n"))
+ "wrong sd_flags in topology description\n"))
sd_flags &= TOPOLOGY_SD_FLAGS;
+ sd_flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
*sd = (struct sched_domain){
.min_interval = sd_weight,
@@ -1679,6 +1708,7 @@ sd_init(struct sched_domain_topology_level *tl,
.newidle_call = 512,
.newidle_success = 256,
.newidle_ratio = 512,
+ .newidle_stamp = now,
.max_newidle_lb_cost = 0,
.last_decay_max_lb_cost = jiffies,
@@ -1686,12 +1716,6 @@ sd_init(struct sched_domain_topology_level *tl,
.name = tl->name,
};
- sd_span = sched_domain_span(sd);
- cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
- sd_id = cpumask_first(sd_span);
-
- sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
-
WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) ==
(SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY),
"CPU capacity asymmetry not supported on SMT\n");
@@ -1727,16 +1751,6 @@ sd_init(struct sched_domain_topology_level *tl,
sd->cache_nice_tries = 1;
}
- /*
- * For all levels sharing cache; connect a sched_domain_shared
- * instance.
- */
- if (sd->flags & SD_SHARE_LLC) {
- sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
- atomic_inc(&sd->shared->ref);
- atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
- }
-
sd->private = sdd;
return sd;
@@ -2372,10 +2386,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sdd->sd)
return -ENOMEM;
- sdd->sds = alloc_percpu(struct sched_domain_shared *);
- if (!sdd->sds)
- return -ENOMEM;
-
sdd->sg = alloc_percpu(struct sched_group *);
if (!sdd->sg)
return -ENOMEM;
@@ -2386,7 +2396,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
for_each_cpu(j, cpu_map) {
struct sched_domain *sd;
- struct sched_domain_shared *sds;
struct sched_group *sg;
struct sched_group_capacity *sgc;
@@ -2397,13 +2406,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
*per_cpu_ptr(sdd->sd, j) = sd;
- sds = kzalloc_node(sizeof(struct sched_domain_shared),
- GFP_KERNEL, cpu_to_node(j));
- if (!sds)
- return -ENOMEM;
-
- *per_cpu_ptr(sdd->sds, j) = sds;
-
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sg)
@@ -2445,8 +2447,6 @@ static void __sdt_free(const struct cpumask *cpu_map)
kfree(*per_cpu_ptr(sdd->sd, j));
}
- if (sdd->sds)
- kfree(*per_cpu_ptr(sdd->sds, j));
if (sdd->sg)
kfree(*per_cpu_ptr(sdd->sg, j));
if (sdd->sgc)
@@ -2454,8 +2454,6 @@ static void __sdt_free(const struct cpumask *cpu_map)
}
free_percpu(sdd->sd);
sdd->sd = NULL;
- free_percpu(sdd->sds);
- sdd->sds = NULL;
free_percpu(sdd->sg);
sdd->sg = NULL;
free_percpu(sdd->sgc);
@@ -2463,6 +2461,42 @@ static void __sdt_free(const struct cpumask *cpu_map)
}
}
+static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map)
+{
+ int j;
+
+ d->sds = alloc_percpu(struct sched_domain_shared *);
+ if (!d->sds)
+ return -ENOMEM;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain_shared *sds;
+
+ sds = kzalloc_node(sizeof(struct sched_domain_shared),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sds)
+ return -ENOMEM;
+
+ *per_cpu_ptr(d->sds, j) = sds;
+ }
+
+ return 0;
+}
+
+static void __sds_free(struct s_data *d, const struct cpumask *cpu_map)
+{
+ int j;
+
+ if (!d->sds)
+ return;
+
+ for_each_cpu(j, cpu_map)
+ kfree(*per_cpu_ptr(d->sds, j));
+
+ free_percpu(d->sds);
+ d->sds = NULL;
+}
+
static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
@@ -2549,6 +2583,74 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
}
/*
+ * Calculate an allowed NUMA imbalance such that LLCs do not get
+ * imbalanced.
+ */
+static void adjust_numa_imbalance(struct sched_domain *sd_llc)
+{
+ struct sched_domain *parent;
+ unsigned int imb_span = 1;
+ unsigned int imb = 0;
+ unsigned int nr_llcs;
+
+ WARN_ON(!(sd_llc->flags & SD_SHARE_LLC));
+ WARN_ON(!sd_llc->parent);
+
+ /*
+ * For a single LLC per node, allow an
+ * imbalance up to 12.5% of the node. This is
+ * arbitrary cutoff based two factors -- SMT and
+ * memory channels. For SMT-2, the intent is to
+ * avoid premature sharing of HT resources but
+ * SMT-4 or SMT-8 *may* benefit from a different
+ * cutoff. For memory channels, this is a very
+ * rough estimate of how many channels may be
+ * active and is based on recent CPUs with
+ * many cores.
+ *
+ * For multiple LLCs, allow an imbalance
+ * until multiple tasks would share an LLC
+ * on one node while LLCs on another node
+ * remain idle. This assumes that there are
+ * enough logical CPUs per LLC to avoid SMT
+ * factors and that there is a correlation
+ * between LLCs and memory channels.
+ */
+ nr_llcs = sd_llc->parent->span_weight / sd_llc->span_weight;
+ if (nr_llcs == 1)
+ imb = sd_llc->parent->span_weight >> 3;
+ else
+ imb = nr_llcs;
+
+ imb = max(1U, imb);
+ sd_llc->parent->imb_numa_nr = imb;
+
+ /*
+ * Set span based on the first NUMA domain.
+ *
+ * NUMA systems always add a NODE domain before
+ * iterating the NUMA domains. Since this is before
+ * degeneration, start from sd_llc's parent's
+ * parent which is the lowest an SD_NUMA domain can
+ * be relative to sd_llc.
+ */
+ parent = sd_llc->parent->parent;
+ while (parent && !(parent->flags & SD_NUMA))
+ parent = parent->parent;
+
+ imb_span = parent ? parent->span_weight : sd_llc->parent->span_weight;
+
+ /* Update the upper remainder of the topology */
+ parent = sd_llc->parent;
+ while (parent) {
+ int factor = max(1U, (parent->span_weight / imb_span));
+
+ parent->imb_numa_nr = imb * factor;
+ parent = parent->parent;
+ }
+}
+
+/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
*/
@@ -2605,61 +2707,28 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
}
}
- /*
- * Calculate an allowed NUMA imbalance such that LLCs do not get
- * imbalanced.
- */
for_each_cpu(i, cpu_map) {
- unsigned int imb = 0;
- unsigned int imb_span = 1;
+ sd = *per_cpu_ptr(d.sd, i);
+ if (!sd)
+ continue;
- for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
- struct sched_domain *child = sd->child;
-
- if (!(sd->flags & SD_SHARE_LLC) && child &&
- (child->flags & SD_SHARE_LLC)) {
- struct sched_domain __rcu *top_p;
- unsigned int nr_llcs;
-
- /*
- * For a single LLC per node, allow an
- * imbalance up to 12.5% of the node. This is
- * arbitrary cutoff based two factors -- SMT and
- * memory channels. For SMT-2, the intent is to
- * avoid premature sharing of HT resources but
- * SMT-4 or SMT-8 *may* benefit from a different
- * cutoff. For memory channels, this is a very
- * rough estimate of how many channels may be
- * active and is based on recent CPUs with
- * many cores.
- *
- * For multiple LLCs, allow an imbalance
- * until multiple tasks would share an LLC
- * on one node while LLCs on another node
- * remain idle. This assumes that there are
- * enough logical CPUs per LLC to avoid SMT
- * factors and that there is a correlation
- * between LLCs and memory channels.
- */
- nr_llcs = sd->span_weight / child->span_weight;
- if (nr_llcs == 1)
- imb = sd->span_weight >> 3;
- else
- imb = nr_llcs;
- imb = max(1U, imb);
- sd->imb_numa_nr = imb;
-
- /* Set span based on the first NUMA domain. */
- top_p = sd->parent;
- while (top_p && !(top_p->flags & SD_NUMA)) {
- top_p = top_p->parent;
- }
- imb_span = top_p ? top_p->span_weight : sd->span_weight;
- } else {
- int factor = max(1U, (sd->span_weight / imb_span));
+ /* First, find the topmost SD_SHARE_LLC domain */
+ while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
+ sd = sd->parent;
- sd->imb_numa_nr = imb * factor;
- }
+ if (sd->flags & SD_SHARE_LLC) {
+ int sd_id = cpumask_first(sched_domain_span(sd));
+
+ sd->shared = *per_cpu_ptr(d.sds, sd_id);
+ atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
+ atomic_inc(&sd->shared->ref);
+
+ /*
+ * In presence of higher domains, adjust the
+ * NUMA imbalance stats for the hierarchy.
+ */
+ if (IS_ENABLED(CONFIG_NUMA) && sd->parent)
+ adjust_numa_imbalance(sd);
}
}
@@ -2668,10 +2737,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (!cpumask_test_cpu(i, cpu_map))
continue;
- for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
- claim_allocations(i, sd);
+ claim_allocations(i, &d);
+
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent)
init_sched_groups_capacity(i, sd);
- }
}
/* Attach the domains */
diff --git a/kernel/signal.c b/kernel/signal.c
index d65d0fe24bfb..2d102e025883 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1000,9 +1000,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
* Found a killable thread. If the signal will be fatal,
* then start taking the whole group down immediately.
*/
- if (sig_fatal(p, sig) &&
- (signal->core_state || !(signal->flags & SIGNAL_GROUP_EXIT)) &&
- !sigismember(&t->real_blocked, sig) &&
+ if (sig_fatal(p, sig) && !sigismember(&t->real_blocked, sig) &&
(sig == SIGKILL || !p->ptrace)) {
/*
* This signal will be fatal to the whole group.
@@ -2173,13 +2171,13 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
bool autoreap = false;
u64 utime, stime;
- WARN_ON_ONCE(sig == -1);
+ if (WARN_ON_ONCE(!valid_signal(sig)))
+ return false;
/* do_notify_parent_cldstop should have been called instead. */
WARN_ON_ONCE(task_is_stopped_or_traced(tsk));
- WARN_ON_ONCE(!tsk->ptrace &&
- (tsk->group_leader != tsk || !thread_group_empty(tsk)));
+ WARN_ON_ONCE(!tsk->ptrace && !thread_group_empty(tsk));
/* ptraced, or group-leader without sub-threads */
do_notify_pidfd(tsk);
@@ -2251,11 +2249,15 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
sig = 0;
}
+ if (!tsk->ptrace && tsk->signal->autoreap) {
+ autoreap = true;
+ sig = 0;
+ }
/*
* Send with __send_signal as si_pid and si_uid are in the
* parent's namespaces.
*/
- if (valid_signal(sig) && sig)
+ if (sig)
__send_signal_locked(sig, &info, tsk->parent, PIDTYPE_TGID, false);
__wake_up_parent(tsk, tsk->parent);
spin_unlock_irqrestore(&psig->siglock, flags);
@@ -2814,8 +2816,9 @@ bool get_signal(struct ksignal *ksig)
/*
* Do this once, we can't return to user-mode if freezing() == T.
- * do_signal_stop() and ptrace_stop() do freezable_schedule() and
- * thus do not need another check after return.
+ * do_signal_stop() and ptrace_stop() set TASK_STOPPED/TASK_TRACED
+ * and the freezer handles those states via TASK_FROZEN, thus they
+ * do not need another check after return.
*/
try_to_freeze();
diff --git a/kernel/smp.c b/kernel/smp.c
index f349960f79ca..a0bb56bd8dda 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -215,7 +215,7 @@ static atomic_t n_csd_lock_stuck;
/**
* csd_lock_is_stuck - Has a CSD-lock acquisition been stuck too long?
*
- * Returns @true if a CSD-lock acquisition is stuck and has been stuck
+ * Returns: %true if a CSD-lock acquisition is stuck and has been stuck
* long enough for a "non-responsive CSD lock" message to be printed.
*/
bool csd_lock_is_stuck(void)
@@ -377,6 +377,20 @@ static __always_inline void csd_unlock(call_single_data_t *csd)
static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
+#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
+static call_single_data_t *get_single_csd_data(int cpu)
+{
+ if (static_branch_unlikely(&csdlock_debug_enabled))
+ return per_cpu_ptr(&csd_data, cpu);
+ return this_cpu_ptr(&csd_data);
+}
+#else
+static call_single_data_t *get_single_csd_data(int cpu)
+{
+ return this_cpu_ptr(&csd_data);
+}
+#endif
+
void __smp_call_single_queue(int cpu, struct llist_node *node)
{
/*
@@ -394,7 +408,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node)
func = CSD_TYPE(csd) == CSD_TYPE_TTWU ?
sched_ttwu_pending : csd->func;
- trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);
+ trace_call__csd_queue_cpu(cpu, _RET_IP_, func, csd);
}
/*
@@ -625,13 +639,14 @@ void flush_smp_call_function_queue(void)
local_irq_restore(flags);
}
-/*
+/**
* smp_call_function_single - Run a function on a specific CPU
+ * @cpu: Specific target CPU for this function.
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
* @wait: If true, wait until function has completed on other CPUs.
*
- * Returns 0 on success, else a negative status code.
+ * Returns: %0 on success, else a negative status code.
*/
int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
int wait)
@@ -670,14 +685,14 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
csd = &csd_stack;
if (!wait) {
- csd = this_cpu_ptr(&csd_data);
+ csd = get_single_csd_data(cpu);
csd_lock(csd);
}
csd->func = func;
csd->info = info;
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
- csd->node.src = smp_processor_id();
+ csd->node.src = this_cpu;
csd->node.dst = cpu;
#endif
@@ -738,18 +753,18 @@ out:
}
EXPORT_SYMBOL_GPL(smp_call_function_single_async);
-/*
+/**
* smp_call_function_any - Run a function on any of the given cpus
* @mask: The mask of cpus it can run on.
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
* @wait: If true, wait until function has completed.
*
- * Returns 0 on success, else a negative status code (if no cpus were online).
- *
* Selection preference:
* 1) current cpu if in @mask
* 2) nearest cpu in @mask, based on NUMA topology
+ *
+ * Returns: %0 on success, else a negative status code (if no cpus were online).
*/
int smp_call_function_any(const struct cpumask *mask,
smp_call_func_t func, void *info, int wait)
@@ -832,7 +847,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
csd->func = func;
csd->info = info;
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
- csd->node.src = smp_processor_id();
+ csd->node.src = this_cpu;
csd->node.dst = cpu;
#endif
trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);
@@ -880,7 +895,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
}
/**
- * smp_call_function_many(): Run a function on a set of CPUs.
+ * smp_call_function_many() - Run a function on a set of CPUs.
* @mask: The set of cpus to run on (only runs on online subset).
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
@@ -902,14 +917,12 @@ void smp_call_function_many(const struct cpumask *mask,
EXPORT_SYMBOL(smp_call_function_many);
/**
- * smp_call_function(): Run a function on all other CPUs.
+ * smp_call_function() - Run a function on all other CPUs.
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
* @wait: If true, wait (atomically) until function has completed
* on other CPUs.
*
- * Returns 0.
- *
* If @wait is true, then returns once @func has returned; otherwise
* it returns just before the target cpu calls @func.
*
@@ -1009,8 +1022,8 @@ void __init smp_init(void)
smp_cpus_done(setup_max_cpus);
}
-/*
- * on_each_cpu_cond(): Call a function on each processor for which
+/**
+ * on_each_cpu_cond_mask() - Call a function on each processor for which
* the supplied function cond_func returns true, optionally waiting
* for all the required CPUs to finish. This may include the local
* processor.
@@ -1024,6 +1037,7 @@ void __init smp_init(void)
* @info: An arbitrary pointer to pass to both functions.
* @wait: If true, wait (atomically) until function has
* completed on other CPUs.
+ * @mask: The set of cpus to run on (only runs on online subset).
*
* Preemption is disabled to protect against CPUs going offline but not online.
* CPUs going online during the call will not be seen or sent an IPI.
@@ -1095,7 +1109,7 @@ EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
* scheduled, for any of the CPUs in the @mask. It does not guarantee
* correctness as it only provides a racy snapshot.
*
- * Returns true if there is a pending IPI scheduled and false otherwise.
+ * Returns: true if there is a pending IPI scheduled and false otherwise.
*/
bool cpus_peek_for_pending_ipi(const struct cpumask *mask)
{
@@ -1145,6 +1159,18 @@ static void smp_call_on_cpu_callback(struct work_struct *work)
complete(&sscs->done);
}
+/**
+ * smp_call_on_cpu() - Call a function on a specific CPU and wait
+ * for it to return.
+ * @cpu: The CPU to run on.
+ * @func: The function to run
+ * @par: An arbitrary pointer parameter for @func.
+ * @phys: If %true, force to run on physical @cpu. See
+ * &struct smp_call_on_cpu_struct for more info.
+ *
+ * Returns: %-ENXIO if the @cpu is invalid; otherwise the return value
+ * from @func.
+ */
int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
{
struct smp_call_on_cpu_struct sscs = {
@@ -1159,7 +1185,7 @@ int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
if (cpu >= nr_cpu_ids || !cpu_online(cpu))
return -ENXIO;
- queue_work_on(cpu, system_wq, &sscs.work);
+ queue_work_on(cpu, system_percpu_wq, &sscs.work);
wait_for_completion(&sscs.done);
destroy_work_on_stack(&sscs.work);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 77198911b8dd..4425d8dce44b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -663,6 +663,13 @@ void irq_enter_rcu(void)
{
__irq_enter_raw();
+ /*
+ * If this is a nested interrupt that hits the exit_to_user_mode_loop
+ * where it has enabled interrupts but before it has hit schedule() we
+ * could have hrtimers in an undefined state. Fix it up here.
+ */
+ hrtimer_rearm_deferred();
+
if (tick_nohz_full_cpu(smp_processor_id()) ||
(is_idle_task(current) && (irq_count() == HARDIRQ_OFFSET)))
tick_irq_enter();
@@ -719,8 +726,14 @@ static inline void __irq_exit_rcu(void)
#endif
account_hardirq_exit(current);
preempt_count_sub(HARDIRQ_OFFSET);
- if (!in_interrupt() && local_softirq_pending())
+ if (!in_interrupt() && local_softirq_pending()) {
+ /*
+ * If we left hrtimers unarmed, make sure to arm them now,
+ * before enabling interrupts to run SoftIRQ.
+ */
+ hrtimer_rearm_deferred();
invoke_softirq();
+ }
if (IS_ENABLED(CONFIG_IRQ_FORCED_THREADING) && force_irqthreads() &&
local_timers_pending_force_th() && !(in_nmi() | in_hardirq()))
diff --git a/kernel/sys.c b/kernel/sys.c
index c86eba9aa7e9..62e842055cc9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2388,17 +2388,18 @@ int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long st
return -EINVAL;
}
-int __weak arch_get_indir_br_lp_status(struct task_struct *t, unsigned long __user *status)
+int __weak arch_prctl_get_branch_landing_pad_state(struct task_struct *t,
+ unsigned long __user *state)
{
return -EINVAL;
}
-int __weak arch_set_indir_br_lp_status(struct task_struct *t, unsigned long status)
+int __weak arch_prctl_set_branch_landing_pad_state(struct task_struct *t, unsigned long state)
{
return -EINVAL;
}
-int __weak arch_lock_indir_br_lp_status(struct task_struct *t, unsigned long status)
+int __weak arch_prctl_lock_branch_landing_pad_state(struct task_struct *t)
{
return -EINVAL;
}
@@ -2888,20 +2889,23 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
error = rseq_slice_extension_prctl(arg2, arg3);
break;
- case PR_GET_INDIR_BR_LP_STATUS:
- if (arg3 || arg4 || arg5)
+ case PR_GET_CFI:
+ if (arg2 != PR_CFI_BRANCH_LANDING_PADS)
return -EINVAL;
- error = arch_get_indir_br_lp_status(me, (unsigned long __user *)arg2);
- break;
- case PR_SET_INDIR_BR_LP_STATUS:
- if (arg3 || arg4 || arg5)
+ if (arg4 || arg5)
return -EINVAL;
- error = arch_set_indir_br_lp_status(me, arg2);
+ error = arch_prctl_get_branch_landing_pad_state(me, (unsigned long __user *)arg3);
break;
- case PR_LOCK_INDIR_BR_LP_STATUS:
- if (arg3 || arg4 || arg5)
+ case PR_SET_CFI:
+ if (arg2 != PR_CFI_BRANCH_LANDING_PADS)
return -EINVAL;
- error = arch_lock_indir_br_lp_status(me, arg2);
+ if (arg4 || arg5)
+ return -EINVAL;
+ error = arch_prctl_set_branch_landing_pad_state(me, arg3);
+ if (error)
+ break;
+ if (arg3 & PR_CFI_LOCK && !(arg3 & PR_CFI_DISABLE))
+ error = arch_prctl_lock_branch_landing_pad_state(me);
break;
default:
trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9d3a666ffde1..c9efb17cc255 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1118,7 +1118,7 @@ int proc_do_large_bitmap(const struct ctl_table *table, int dir,
unsigned long bitmap_len = table->maxlen;
unsigned long *bitmap = *(unsigned long **) table->data;
unsigned long *tmp_bitmap = NULL;
- char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
+ char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c = 0;
if (!bitmap || !bitmap_len || !left || (*ppos && SYSCTL_KERN_TO_USER(dir))) {
*lenp = 0;
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 0cd680ccc7e5..73bd6a6a7893 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -649,6 +649,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
goto err;
memcpy(stats, tsk->signal->stats, sizeof(*stats));
+ stats->version = TASKSTATS_VERSION;
send:
send_cpu_listeners(rep_skb, listeners);
diff --git a/kernel/time/.kunitconfig b/kernel/time/.kunitconfig
new file mode 100644
index 000000000000..d60a611b2853
--- /dev/null
+++ b/kernel/time/.kunitconfig
@@ -0,0 +1,2 @@
+CONFIG_KUNIT=y
+CONFIG_TIME_KUNIT_TEST=y
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 7c6a52f7836c..02aac7c5aa76 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -9,14 +9,13 @@
config CLOCKSOURCE_WATCHDOG
bool
-# Architecture has extra clocksource data
-config ARCH_CLOCKSOURCE_DATA
- bool
-
# Architecture has extra clocksource init called from registration
config ARCH_CLOCKSOURCE_INIT
bool
+config ARCH_WANTS_CLOCKSOURCE_READ_INLINE
+ bool
+
# Timekeeping vsyscall support
config GENERIC_TIME_VSYSCALL
bool
@@ -44,10 +43,23 @@ config GENERIC_CLOCKEVENTS_BROADCAST_IDLE
config GENERIC_CLOCKEVENTS_MIN_ADJUST
bool
+config GENERIC_CLOCKEVENTS_COUPLED
+ bool
+
+config GENERIC_CLOCKEVENTS_COUPLED_INLINE
+ select GENERIC_CLOCKEVENTS_COUPLED
+ bool
+
# Generic update of CMOS clock
config GENERIC_CMOS_UPDATE
bool
+# Deferred rearming of the hrtimer interrupt
+config HRTIMER_REARM_DEFERRED
+ def_bool y
+ depends on GENERIC_ENTRY && HAVE_GENERIC_TIF_BITS
+ depends on HIGH_RES_TIMERS && SCHED_HRTICK
+
# Select to handle posix CPU timers from task_work
# and not from the timer interrupt context
config HAVE_POSIX_CPU_TIMERS_TASK_WORK
@@ -196,18 +208,6 @@ config HIGH_RES_TIMERS
hardware is not capable then this option only increases
the size of the kernel image.
-config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
- int "Clocksource watchdog maximum allowable skew (in microseconds)"
- depends on CLOCKSOURCE_WATCHDOG
- range 50 1000
- default 125
- help
- Specify the maximum amount of allowable watchdog skew in
- microseconds before reporting the clocksource to be unstable.
- The default is based on a half-second clocksource watchdog
- interval and NTP's maximum frequency drift of 500 parts
- per million. If the clocksource is good enough for NTP,
- it is good enough for the clocksource watchdog!
endif
config POSIX_AUX_CLOCKS
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index f7d52d9543cc..eaf290c972f9 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -30,5 +30,6 @@ obj-$(CONFIG_GENERIC_GETTIMEOFDAY) += vsyscall.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
obj-$(CONFIG_TIME_NS) += namespace.o
+obj-$(CONFIG_TIME_NS_VDSO) += namespace_vdso.o
obj-$(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) += clocksource-wdtest.o
obj-$(CONFIG_TIME_KUNIT_TEST) += time_test.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 069d93bfb0c7..6e173d70d825 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -234,19 +234,23 @@ static int alarmtimer_suspend(struct device *dev)
if (!rtc)
return 0;
- /* Find the soonest timer to expire*/
+ /* Find the soonest timer to expire */
for (i = 0; i < ALARM_NUMTYPE; i++) {
struct alarm_base *base = &alarm_bases[i];
struct timerqueue_node *next;
+ ktime_t next_expires;
ktime_t delta;
- scoped_guard(spinlock_irqsave, &base->lock)
+ scoped_guard(spinlock_irqsave, &base->lock) {
next = timerqueue_getnext(&base->timerqueue);
+ if (next)
+ next_expires = next->expires;
+ }
if (!next)
continue;
- delta = ktime_sub(next->expires, base->get_ktime());
+ delta = ktime_sub(next_expires, base->get_ktime());
if (!min || (delta < min)) {
- expires = next->expires;
+ expires = next_expires;
min = delta;
type = i;
}
@@ -540,7 +544,7 @@ static s64 alarm_timer_forward(struct k_itimer *timr, ktime_t now)
{
struct alarm *alarm = &timr->it.alarm.alarmtimer;
- return alarm_forward(alarm, timr->it_interval, now);
+ return alarm_forward(alarm, now, timr->it_interval);
}
/**
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index eaae1ce9f060..b4d730604972 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -172,6 +172,7 @@ void clockevents_shutdown(struct clock_event_device *dev)
{
clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
dev->next_event = KTIME_MAX;
+ dev->next_event_forced = 0;
}
/**
@@ -292,6 +293,38 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_COUPLED
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_COUPLED_INLINE
+#include <asm/clock_inlined.h>
+#else
+static __always_inline void
+arch_inlined_clockevent_set_next_coupled(u64 cycles, struct clock_event_device *dev) { }
+#endif
+
+static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires)
+{
+ u64 cycles;
+
+ if (unlikely(!(dev->features & CLOCK_EVT_FEAT_CLOCKSOURCE_COUPLED)))
+ return false;
+
+ if (unlikely(!ktime_expiry_to_cycles(dev->cs_id, expires, &cycles)))
+ return false;
+
+ if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED_INLINE))
+ arch_inlined_clockevent_set_next_coupled(cycles, dev);
+ else
+ dev->set_next_coupled(cycles, dev);
+ return true;
+}
+
+#else
+static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires)
+{
+ return false;
+}
+#endif
+
/**
* clockevents_program_event - Reprogram the clock event device.
* @dev: device to program
@@ -300,12 +333,10 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
*
* Returns 0 on success, -ETIME when the event is in the past.
*/
-int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
- bool force)
+int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force)
{
- unsigned long long clc;
int64_t delta;
- int rc;
+ u64 cycles;
if (WARN_ON_ONCE(expires < 0))
return -ETIME;
@@ -319,21 +350,35 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n",
clockevent_get_state(dev));
- /* Shortcut for clockevent devices that can deal with ktime. */
- if (dev->features & CLOCK_EVT_FEAT_KTIME)
+ /* ktime_t based reprogramming for the broadcast hrtimer device */
+ if (unlikely(dev->features & CLOCK_EVT_FEAT_HRTIMER))
return dev->set_next_ktime(expires, dev);
+ if (likely(clockevent_set_next_coupled(dev, expires)))
+ return 0;
+
delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
- if (delta <= 0)
- return force ? clockevents_program_min_delta(dev) : -ETIME;
- delta = min(delta, (int64_t) dev->max_delta_ns);
- delta = max(delta, (int64_t) dev->min_delta_ns);
+ /* Required for tick_periodic() during early boot */
+ if (delta <= 0 && !force)
+ return -ETIME;
+
+ if (delta > (int64_t)dev->min_delta_ns) {
+ delta = min(delta, (int64_t) dev->max_delta_ns);
+ cycles = ((u64)delta * dev->mult) >> dev->shift;
+ if (!dev->set_next_event((unsigned long) cycles, dev))
+ return 0;
+ }
- clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
- rc = dev->set_next_event((unsigned long) clc, dev);
+ if (dev->next_event_forced)
+ return 0;
- return (rc && force) ? clockevents_program_min_delta(dev) : rc;
+ if (dev->set_next_event(dev->min_delta_ticks, dev)) {
+ if (!force || clockevents_program_min_delta(dev))
+ return -ETIME;
+ }
+ dev->next_event_forced = 1;
+ return 0;
}
/*
diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c
index 38dae590b29f..b4cf17b4aeed 100644
--- a/kernel/time/clocksource-wdtest.c
+++ b/kernel/time/clocksource-wdtest.c
@@ -3,202 +3,196 @@
* Unit test for the clocksource watchdog.
*
* Copyright (C) 2021 Facebook, Inc.
+ * Copyright (C) 2026 Intel Corp.
*
* Author: Paul E. McKenney <paulmck@kernel.org>
+ * Author: Thomas Gleixner <tglx@kernel.org>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/device.h>
#include <linux/clocksource.h>
-#include <linux/init.h>
+#include <linux/delay.h>
#include <linux/module.h>
-#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
-#include <linux/tick.h>
#include <linux/kthread.h>
-#include <linux/delay.h>
-#include <linux/prandom.h>
-#include <linux/cpu.h>
#include "tick-internal.h"
+#include "timekeeping_internal.h"
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Clocksource watchdog unit test");
MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>");
+MODULE_AUTHOR("Thomas Gleixner <tglx@kernel.org>");
+
+enum wdtest_states {
+ WDTEST_INJECT_NONE,
+ WDTEST_INJECT_DELAY,
+ WDTEST_INJECT_POSITIVE,
+ WDTEST_INJECT_NEGATIVE,
+ WDTEST_INJECT_PERCPU = 0x100,
+};
-static int holdoff = IS_BUILTIN(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) ? 10 : 0;
-module_param(holdoff, int, 0444);
-MODULE_PARM_DESC(holdoff, "Time to wait to start test (s).");
+static enum wdtest_states wdtest_state;
+static unsigned long wdtest_test_count;
+static ktime_t wdtest_last_ts, wdtest_offset;
-/* Watchdog kthread's task_struct pointer for debug purposes. */
-static struct task_struct *wdtest_task;
+#define SHIFT_4000PPM 8
-static u64 wdtest_jiffies_read(struct clocksource *cs)
+static ktime_t wdtest_get_offset(struct clocksource *cs)
{
- return (u64)jiffies;
-}
-
-static struct clocksource clocksource_wdtest_jiffies = {
- .name = "wdtest-jiffies",
- .rating = 1, /* lowest valid rating*/
- .uncertainty_margin = TICK_NSEC,
- .read = wdtest_jiffies_read,
- .mask = CLOCKSOURCE_MASK(32),
- .flags = CLOCK_SOURCE_MUST_VERIFY,
- .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */
- .shift = JIFFIES_SHIFT,
- .max_cycles = 10,
-};
+ if (wdtest_state < WDTEST_INJECT_PERCPU)
+ return wdtest_test_count & 0x1 ? 0 : wdtest_offset >> SHIFT_4000PPM;
-static int wdtest_ktime_read_ndelays;
-static bool wdtest_ktime_read_fuzz;
+ /* Only affect the readout of the "remote" CPU */
+ return cs->wd_cpu == smp_processor_id() ? 0 : NSEC_PER_MSEC;
+}
static u64 wdtest_ktime_read(struct clocksource *cs)
{
- int wkrn = READ_ONCE(wdtest_ktime_read_ndelays);
- static int sign = 1;
- u64 ret;
+ ktime_t now = ktime_get_raw_fast_ns();
+ ktime_t intv = now - wdtest_last_ts;
- if (wkrn) {
- udelay(cs->uncertainty_margin / 250);
- WRITE_ONCE(wdtest_ktime_read_ndelays, wkrn - 1);
- }
- ret = ktime_get_real_fast_ns();
- if (READ_ONCE(wdtest_ktime_read_fuzz)) {
- sign = -sign;
- ret = ret + sign * 100 * NSEC_PER_MSEC;
+ /*
+ * Only increment the test counter once per watchdog interval and
+ * store the interval for the offset calculation of this step. This
+ * guarantees a consistent behaviour even if the other side needs
+ * to repeat due to a watchdog read timeout.
+ */
+ if (intv > (NSEC_PER_SEC / 4)) {
+ WRITE_ONCE(wdtest_test_count, wdtest_test_count + 1);
+ wdtest_last_ts = now;
+ wdtest_offset = intv;
}
- return ret;
-}
-static void wdtest_ktime_cs_mark_unstable(struct clocksource *cs)
-{
- pr_info("--- Marking %s unstable due to clocksource watchdog.\n", cs->name);
+ switch (wdtest_state & ~WDTEST_INJECT_PERCPU) {
+ case WDTEST_INJECT_POSITIVE:
+ return now + wdtest_get_offset(cs);
+ case WDTEST_INJECT_NEGATIVE:
+ return now - wdtest_get_offset(cs);
+ case WDTEST_INJECT_DELAY:
+ udelay(500);
+ return now;
+ default:
+ return now;
+ }
}
-#define KTIME_FLAGS (CLOCK_SOURCE_IS_CONTINUOUS | \
- CLOCK_SOURCE_VALID_FOR_HRES | \
- CLOCK_SOURCE_MUST_VERIFY | \
- CLOCK_SOURCE_VERIFY_PERCPU)
+#define KTIME_FLAGS (CLOCK_SOURCE_IS_CONTINUOUS | \
+ CLOCK_SOURCE_CALIBRATED | \
+ CLOCK_SOURCE_MUST_VERIFY | \
+ CLOCK_SOURCE_WDTEST)
static struct clocksource clocksource_wdtest_ktime = {
.name = "wdtest-ktime",
- .rating = 300,
+ .rating = 10,
.read = wdtest_ktime_read,
.mask = CLOCKSOURCE_MASK(64),
.flags = KTIME_FLAGS,
- .mark_unstable = wdtest_ktime_cs_mark_unstable,
.list = LIST_HEAD_INIT(clocksource_wdtest_ktime.list),
};
-/* Reset the clocksource if needed. */
-static void wdtest_ktime_clocksource_reset(void)
+static void wdtest_clocksource_reset(enum wdtest_states which, bool percpu)
+{
+ clocksource_unregister(&clocksource_wdtest_ktime);
+
+ pr_info("Test: State %d percpu %d\n", which, percpu);
+
+ wdtest_state = which;
+ if (percpu)
+ wdtest_state |= WDTEST_INJECT_PERCPU;
+ wdtest_test_count = 0;
+ wdtest_last_ts = 0;
+
+ clocksource_wdtest_ktime.rating = 10;
+ clocksource_wdtest_ktime.flags = KTIME_FLAGS;
+ if (percpu)
+ clocksource_wdtest_ktime.flags |= CLOCK_SOURCE_WDTEST_PERCPU;
+ clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000);
+}
+
+static bool wdtest_execute(enum wdtest_states which, bool percpu, unsigned int expect,
+ unsigned long calls)
{
- if (clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE) {
- clocksource_unregister(&clocksource_wdtest_ktime);
- clocksource_wdtest_ktime.flags = KTIME_FLAGS;
- schedule_timeout_uninterruptible(HZ / 10);
- clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000);
+ wdtest_clocksource_reset(which, percpu);
+
+ for (; READ_ONCE(wdtest_test_count) < calls; msleep(100)) {
+ unsigned int flags = READ_ONCE(clocksource_wdtest_ktime.flags);
+
+ if (kthread_should_stop())
+ return false;
+
+ if (flags & CLOCK_SOURCE_UNSTABLE) {
+ if (expect & CLOCK_SOURCE_UNSTABLE)
+ return true;
+ pr_warn("Fail: Unexpected unstable\n");
+ return false;
+ }
+ if (flags & CLOCK_SOURCE_VALID_FOR_HRES) {
+ if (expect & CLOCK_SOURCE_VALID_FOR_HRES)
+ return true;
+ pr_warn("Fail: Unexpected valid for highres\n");
+ return false;
+ }
}
+
+ if (!expect)
+ return true;
+
+ pr_warn("Fail: Timed out\n");
+ return false;
}
-/* Run the specified series of watchdog tests. */
-static int wdtest_func(void *arg)
+static bool wdtest_run(bool percpu)
{
- unsigned long j1, j2;
- int i, max_retries;
- char *s;
+ if (!wdtest_execute(WDTEST_INJECT_NONE, percpu, CLOCK_SOURCE_VALID_FOR_HRES, 8))
+ return false;
- schedule_timeout_uninterruptible(holdoff * HZ);
+ if (!wdtest_execute(WDTEST_INJECT_DELAY, percpu, 0, 4))
+ return false;
- /*
- * Verify that jiffies-like clocksources get the manually
- * specified uncertainty margin.
- */
- pr_info("--- Verify jiffies-like uncertainty margin.\n");
- __clocksource_register(&clocksource_wdtest_jiffies);
- WARN_ON_ONCE(clocksource_wdtest_jiffies.uncertainty_margin != TICK_NSEC);
+ if (!wdtest_execute(WDTEST_INJECT_POSITIVE, percpu, CLOCK_SOURCE_UNSTABLE, 8))
+ return false;
- j1 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies);
- schedule_timeout_uninterruptible(HZ);
- j2 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies);
- WARN_ON_ONCE(j1 == j2);
+ if (!wdtest_execute(WDTEST_INJECT_NEGATIVE, percpu, CLOCK_SOURCE_UNSTABLE, 8))
+ return false;
- clocksource_unregister(&clocksource_wdtest_jiffies);
+ return true;
+}
- /*
- * Verify that tsc-like clocksources are assigned a reasonable
- * uncertainty margin.
- */
- pr_info("--- Verify tsc-like uncertainty margin.\n");
+static int wdtest_func(void *arg)
+{
clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000);
- WARN_ON_ONCE(clocksource_wdtest_ktime.uncertainty_margin < NSEC_PER_USEC);
-
- j1 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime);
- udelay(1);
- j2 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime);
- pr_info("--- tsc-like times: %lu - %lu = %lu.\n", j2, j1, j2 - j1);
- WARN_ONCE(time_before(j2, j1 + NSEC_PER_USEC),
- "Expected at least 1000ns, got %lu.\n", j2 - j1);
-
- /* Verify tsc-like stability with various numbers of errors injected. */
- max_retries = clocksource_get_max_watchdog_retry();
- for (i = 0; i <= max_retries + 1; i++) {
- if (i <= 1 && i < max_retries)
- s = "";
- else if (i <= max_retries)
- s = ", expect message";
- else
- s = ", expect clock skew";
- pr_info("--- Watchdog with %dx error injection, %d retries%s.\n", i, max_retries, s);
- WRITE_ONCE(wdtest_ktime_read_ndelays, i);
- schedule_timeout_uninterruptible(2 * HZ);
- WARN_ON_ONCE(READ_ONCE(wdtest_ktime_read_ndelays));
- WARN_ON_ONCE((i <= max_retries) !=
- !(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE));
- wdtest_ktime_clocksource_reset();
+ if (wdtest_run(false)) {
+ if (wdtest_run(true))
+ pr_info("Success: All tests passed\n");
}
-
- /* Verify tsc-like stability with clock-value-fuzz error injection. */
- pr_info("--- Watchdog clock-value-fuzz error injection, expect clock skew and per-CPU mismatches.\n");
- WRITE_ONCE(wdtest_ktime_read_fuzz, true);
- schedule_timeout_uninterruptible(2 * HZ);
- WARN_ON_ONCE(!(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE));
- clocksource_verify_percpu(&clocksource_wdtest_ktime);
- WRITE_ONCE(wdtest_ktime_read_fuzz, false);
-
clocksource_unregister(&clocksource_wdtest_ktime);
- pr_info("--- Done with test.\n");
- return 0;
-}
+ if (!IS_MODULE(CONFIG_TEST_CLOCKSOURCE_WATCHDOG))
+ return 0;
-static void wdtest_print_module_parms(void)
-{
- pr_alert("--- holdoff=%d\n", holdoff);
+ while (!kthread_should_stop())
+ schedule_timeout_interruptible(3600 * HZ);
+ return 0;
}
-/* Cleanup function. */
-static void clocksource_wdtest_cleanup(void)
-{
-}
+static struct task_struct *wdtest_thread;
static int __init clocksource_wdtest_init(void)
{
- int ret = 0;
-
- wdtest_print_module_parms();
+ struct task_struct *t = kthread_run(wdtest_func, NULL, "wdtest");
- /* Create watchdog-test task. */
- wdtest_task = kthread_run(wdtest_func, NULL, "wdtest");
- if (IS_ERR(wdtest_task)) {
- ret = PTR_ERR(wdtest_task);
- pr_warn("%s: Failed to create wdtest kthread.\n", __func__);
- wdtest_task = NULL;
- return ret;
+ if (IS_ERR(t)) {
+ pr_warn("Failed to create wdtest kthread.\n");
+ return PTR_ERR(t);
}
-
+ wdtest_thread = t;
return 0;
}
-
module_init(clocksource_wdtest_init);
+
+static void clocksource_wdtest_cleanup(void)
+{
+ if (wdtest_thread)
+ kthread_stop(wdtest_thread);
+}
module_exit(clocksource_wdtest_cleanup);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index df7194961658..baee13a1f87f 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -7,15 +7,17 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/device.h>
#include <linux/clocksource.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/device.h>
#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
-#include <linux/tick.h>
#include <linux/kthread.h>
+#include <linux/module.h>
#include <linux/prandom.h>
-#include <linux/cpu.h>
+#include <linux/sched.h>
+#include <linux/tick.h>
+#include <linux/topology.h>
#include "tick-internal.h"
#include "timekeeping_internal.h"
@@ -107,48 +109,6 @@ static char override_name[CS_NAME_LEN];
static int finished_booting;
static u64 suspend_start;
-/*
- * Interval: 0.5sec.
- */
-#define WATCHDOG_INTERVAL (HZ >> 1)
-#define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ))
-
-/*
- * Threshold: 0.0312s, when doubled: 0.0625s.
- */
-#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5)
-
-/*
- * Maximum permissible delay between two readouts of the watchdog
- * clocksource surrounding a read of the clocksource being validated.
- * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as
- * a lower bound for cs->uncertainty_margin values when registering clocks.
- *
- * The default of 500 parts per million is based on NTP's limits.
- * If a clocksource is good enough for NTP, it is good enough for us!
- *
- * In other words, by default, even if a clocksource is extremely
- * precise (for example, with a sub-nanosecond period), the maximum
- * permissible skew between the clocksource watchdog and the clocksource
- * under test is not permitted to go below the 500ppm minimum defined
- * by MAX_SKEW_USEC. This 500ppm minimum may be overridden using the
- * CLOCKSOURCE_WATCHDOG_MAX_SKEW_US Kconfig option.
- */
-#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
-#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
-#else
-#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ)
-#endif
-
-/*
- * Default for maximum permissible skew when cs->uncertainty_margin is
- * not specified, and the lower bound even when cs->uncertainty_margin
- * is specified. This is also the default that is used when registering
- * clocks with unspecified cs->uncertainty_margin, so this macro is used
- * even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels.
- */
-#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
-
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
static void clocksource_watchdog_work(struct work_struct *work);
static void clocksource_select(void);
@@ -160,7 +120,42 @@ static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
static DEFINE_SPINLOCK(watchdog_lock);
static int watchdog_running;
static atomic_t watchdog_reset_pending;
-static int64_t watchdog_max_interval;
+
+/* Watchdog interval: 0.5sec. */
+#define WATCHDOG_INTERVAL (HZ >> 1)
+#define WATCHDOG_INTERVAL_NS (WATCHDOG_INTERVAL * (NSEC_PER_SEC / HZ))
+
+/* Maximum time between two reference watchdog readouts */
+#define WATCHDOG_READOUT_MAX_NS (50U * NSEC_PER_USEC)
+
+/*
+ * Maximum time between two remote readouts for NUMA=n. On NUMA enabled systems
+ * the timeout is calculated from the numa distance.
+ */
+#define WATCHDOG_DEFAULT_TIMEOUT_NS (50U * NSEC_PER_USEC)
+
+/*
+ * Remote timeout NUMA distance multiplier. The local distance is 10. The
+ * default remote distance is 20. ACPI tables provide more accurate numbers
+ * which are guaranteed to be greater than the local distance.
+ *
+ * This results in a 5us base value, which is equivalent to the above !NUMA
+ * default.
+ */
+#define WATCHDOG_NUMA_MULTIPLIER_NS ((u64)(WATCHDOG_DEFAULT_TIMEOUT_NS / LOCAL_DISTANCE))
+
+/* Limit the NUMA timeout in case the distance values are insanely big */
+#define WATCHDOG_NUMA_MAX_TIMEOUT_NS ((u64)(500U * NSEC_PER_USEC))
+
+/* Shift values to calculate the approximate $N ppm of a given delta. */
+#define SHIFT_500PPM 11
+#define SHIFT_4000PPM 8
+
+/* Number of attempts to read the watchdog */
+#define WATCHDOG_FREQ_RETRIES 3
+
+/* Five reads local and remote for inter CPU skew detection */
+#define WATCHDOG_REMOTE_MAX_SEQ 10
static inline void clocksource_watchdog_lock(unsigned long *flags)
{
@@ -241,204 +236,422 @@ void clocksource_mark_unstable(struct clocksource *cs)
spin_unlock_irqrestore(&watchdog_lock, flags);
}
-static int verify_n_cpus = 8;
-module_param(verify_n_cpus, int, 0644);
+static inline void clocksource_reset_watchdog(void)
+{
+ struct clocksource *cs;
-enum wd_read_status {
- WD_READ_SUCCESS,
- WD_READ_UNSTABLE,
- WD_READ_SKIP
+ list_for_each_entry(cs, &watchdog_list, wd_list)
+ cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+}
+
+enum wd_result {
+ WD_SUCCESS,
+ WD_FREQ_NO_WATCHDOG,
+ WD_FREQ_TIMEOUT,
+ WD_FREQ_RESET,
+ WD_FREQ_SKEWED,
+ WD_CPU_TIMEOUT,
+ WD_CPU_SKEWED,
+};
+
+struct watchdog_cpu_data {
+ /* Keep first as it is 32 byte aligned */
+ call_single_data_t csd;
+ atomic_t remote_inprogress;
+ enum wd_result result;
+ u64 cpu_ts[2];
+ struct clocksource *cs;
+ /* Ensure that the sequence is in a separate cache line */
+ atomic_t seq ____cacheline_aligned;
+ /* Set by the control CPU according to NUMA distance */
+ u64 timeout_ns;
};
-static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
-{
- int64_t md = watchdog->uncertainty_margin;
- unsigned int nretries, max_retries;
- int64_t wd_delay, wd_seq_delay;
- u64 wd_end, wd_end2;
-
- max_retries = clocksource_get_max_watchdog_retry();
- for (nretries = 0; nretries <= max_retries; nretries++) {
- local_irq_disable();
- *wdnow = watchdog->read(watchdog);
- *csnow = cs->read(cs);
- wd_end = watchdog->read(watchdog);
- wd_end2 = watchdog->read(watchdog);
- local_irq_enable();
-
- wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end);
- if (wd_delay <= md + cs->uncertainty_margin) {
- if (nretries > 1 && nretries >= max_retries) {
- pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
- smp_processor_id(), watchdog->name, nretries);
+struct watchdog_data {
+ raw_spinlock_t lock;
+ enum wd_result result;
+
+ u64 wd_seq;
+ u64 wd_delta;
+ u64 cs_delta;
+ u64 cpu_ts[2];
+
+ unsigned int curr_cpu;
+} ____cacheline_aligned_in_smp;
+
+static void watchdog_check_skew_remote(void *unused);
+
+static DEFINE_PER_CPU_ALIGNED(struct watchdog_cpu_data, watchdog_cpu_data) = {
+ .csd = CSD_INIT(watchdog_check_skew_remote, NULL),
+};
+
+static struct watchdog_data watchdog_data = {
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(watchdog_data.lock),
+};
+
+static inline void watchdog_set_result(struct watchdog_cpu_data *wd, enum wd_result result)
+{
+ guard(raw_spinlock)(&watchdog_data.lock);
+ if (!wd->result) {
+ atomic_set(&wd->seq, WATCHDOG_REMOTE_MAX_SEQ);
+ WRITE_ONCE(wd->result, result);
+ }
+}
+
+/* Wait for the sequence number to hand over control. */
+static bool watchdog_wait_seq(struct watchdog_cpu_data *wd, u64 start, int seq)
+{
+ for(int cnt = 0; atomic_read(&wd->seq) < seq; cnt++) {
+ /* Bail if the other side set an error result */
+ if (READ_ONCE(wd->result) != WD_SUCCESS)
+ return false;
+
+ /* Prevent endless loops if the other CPU does not react. */
+ if (cnt == 5000) {
+ u64 nsecs = ktime_get_raw_fast_ns();
+
+ if (nsecs - start >=wd->timeout_ns) {
+ watchdog_set_result(wd, WD_CPU_TIMEOUT);
+ return false;
}
- return WD_READ_SUCCESS;
+ cnt = 0;
}
+ cpu_relax();
+ }
+ return seq < WATCHDOG_REMOTE_MAX_SEQ;
+}
- /*
- * Now compute delay in consecutive watchdog read to see if
- * there is too much external interferences that cause
- * significant delay in reading both clocksource and watchdog.
- *
- * If consecutive WD read-back delay > md, report
- * system busy, reinit the watchdog and skip the current
- * watchdog test.
- */
- wd_seq_delay = cycles_to_nsec_safe(watchdog, wd_end, wd_end2);
- if (wd_seq_delay > md)
- goto skip_test;
+static void watchdog_check_skew(struct watchdog_cpu_data *wd, int index)
+{
+ u64 prev, now, delta, start = ktime_get_raw_fast_ns();
+ int local = index, remote = (index + 1) & 0x1;
+ struct clocksource *cs = wd->cs;
+
+ /* Set the local timestamp so that the first iteration works correctly */
+ wd->cpu_ts[local] = cs->read(cs);
+
+ /* Signal arrival */
+ atomic_inc(&wd->seq);
+
+ for (int seq = local + 2; seq < WATCHDOG_REMOTE_MAX_SEQ; seq += 2) {
+ if (!watchdog_wait_seq(wd, start, seq))
+ return;
+
+ /* Capture local timestamp before possible non-local coherency overhead */
+ now = cs->read(cs);
+
+ /* Store local timestamp before reading remote to limit coherency stalls */
+ wd->cpu_ts[local] = now;
+
+ prev = wd->cpu_ts[remote];
+ delta = (now - prev) & cs->mask;
+
+ if (delta > cs->max_raw_delta) {
+ watchdog_set_result(wd, WD_CPU_SKEWED);
+ return;
+ }
+
+ /* Hand over to the remote CPU */
+ atomic_inc(&wd->seq);
}
+}
- pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n",
- smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name);
- return WD_READ_UNSTABLE;
+static void watchdog_check_skew_remote(void *unused)
+{
+ struct watchdog_cpu_data *wd = this_cpu_ptr(&watchdog_cpu_data);
-skip_test:
- pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n",
- smp_processor_id(), watchdog->name, wd_seq_delay);
- pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n",
- cs->name, wd_delay);
- return WD_READ_SKIP;
+ atomic_inc(&wd->remote_inprogress);
+ watchdog_check_skew(wd, 1);
+ atomic_dec(&wd->remote_inprogress);
}
-static u64 csnow_mid;
-static cpumask_t cpus_ahead;
-static cpumask_t cpus_behind;
-static cpumask_t cpus_chosen;
+static inline bool wd_csd_locked(struct watchdog_cpu_data *wd)
+{
+ return READ_ONCE(wd->csd.node.u_flags) & CSD_FLAG_LOCK;
+}
+
+/*
+ * This is only invoked for remote CPUs. See watchdog_check_cpu_skew().
+ */
+static inline u64 wd_get_remote_timeout(unsigned int remote_cpu)
+{
+ unsigned int n1, n2;
+ u64 ns;
+
+ if (nr_node_ids == 1)
+ return WATCHDOG_DEFAULT_TIMEOUT_NS;
+
+ n1 = cpu_to_node(smp_processor_id());
+ n2 = cpu_to_node(remote_cpu);
+ ns = WATCHDOG_NUMA_MULTIPLIER_NS * node_distance(n1, n2);
+ return min(ns, WATCHDOG_NUMA_MAX_TIMEOUT_NS);
+}
-static void clocksource_verify_choose_cpus(void)
+static void __watchdog_check_cpu_skew(struct clocksource *cs, unsigned int cpu)
{
- int cpu, i, n = verify_n_cpus;
+ struct watchdog_cpu_data *wd;
- if (n < 0 || n >= num_online_cpus()) {
- /* Check all of the CPUs. */
- cpumask_copy(&cpus_chosen, cpu_online_mask);
- cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
+ wd = per_cpu_ptr(&watchdog_cpu_data, cpu);
+ if (atomic_read(&wd->remote_inprogress) || wd_csd_locked(wd)) {
+ watchdog_data.result = WD_CPU_TIMEOUT;
return;
}
- /* If no checking desired, or no other CPU to check, leave. */
- cpumask_clear(&cpus_chosen);
- if (n == 0 || num_online_cpus() <= 1)
+ atomic_set(&wd->seq, 0);
+ wd->result = WD_SUCCESS;
+ wd->cs = cs;
+ /* Store the current CPU ID for the watchdog test unit */
+ cs->wd_cpu = smp_processor_id();
+
+ wd->timeout_ns = wd_get_remote_timeout(cpu);
+
+ /* Kick the remote CPU into the watchdog function */
+ if (WARN_ON_ONCE(smp_call_function_single_async(cpu, &wd->csd))) {
+ watchdog_data.result = WD_CPU_TIMEOUT;
+ return;
+ }
+
+ scoped_guard(irq)
+ watchdog_check_skew(wd, 0);
+
+ scoped_guard(raw_spinlock_irq, &watchdog_data.lock) {
+ watchdog_data.result = wd->result;
+ memcpy(watchdog_data.cpu_ts, wd->cpu_ts, sizeof(wd->cpu_ts));
+ }
+}
+
+static void watchdog_check_cpu_skew(struct clocksource *cs)
+{
+ unsigned int cpu = watchdog_data.curr_cpu;
+
+ cpu = cpumask_next_wrap(cpu, cpu_online_mask);
+ watchdog_data.curr_cpu = cpu;
+
+ /* Skip the current CPU. Handles num_online_cpus() == 1 as well */
+ if (cpu == smp_processor_id())
return;
- /* Make sure to select at least one CPU other than the current CPU. */
- cpu = cpumask_any_but(cpu_online_mask, smp_processor_id());
- if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
+ /* Don't interfere with the test mechanics */
+ if ((cs->flags & CLOCK_SOURCE_WDTEST) && !(cs->flags & CLOCK_SOURCE_WDTEST_PERCPU))
return;
- cpumask_set_cpu(cpu, &cpus_chosen);
- /* Force a sane value for the boot parameter. */
- if (n > nr_cpu_ids)
- n = nr_cpu_ids;
+ __watchdog_check_cpu_skew(cs, cpu);
+}
+
+static bool watchdog_check_freq(struct clocksource *cs, bool reset_pending)
+{
+ unsigned int ppm_shift = SHIFT_4000PPM;
+ u64 wd_ts0, wd_ts1, cs_ts;
+
+ watchdog_data.result = WD_SUCCESS;
+ if (!watchdog) {
+ watchdog_data.result = WD_FREQ_NO_WATCHDOG;
+ return false;
+ }
+
+ if (cs->flags & CLOCK_SOURCE_WDTEST_PERCPU)
+ return true;
/*
- * Randomly select the specified number of CPUs. If the same
- * CPU is selected multiple times, that CPU is checked only once,
- * and no replacement CPU is selected. This gracefully handles
- * situations where verify_n_cpus is greater than the number of
- * CPUs that are currently online.
+ * If both the clocksource and the watchdog claim they are
+ * calibrated use 500ppm limit. Uncalibrated clocksources need a
+	 * larger allowance because the firmware supplied frequencies can be
+ * way off.
*/
- for (i = 1; i < n; i++) {
- cpu = cpumask_random(cpu_online_mask);
- if (!WARN_ON_ONCE(cpu >= nr_cpu_ids))
- cpumask_set_cpu(cpu, &cpus_chosen);
+ if (watchdog->flags & CLOCK_SOURCE_CALIBRATED && cs->flags & CLOCK_SOURCE_CALIBRATED)
+ ppm_shift = SHIFT_500PPM;
+
+ for (int retries = 0; retries < WATCHDOG_FREQ_RETRIES; retries++) {
+ s64 wd_last, cs_last, wd_seq, wd_delta, cs_delta, max_delta;
+
+ scoped_guard(irq) {
+ wd_ts0 = watchdog->read(watchdog);
+ cs_ts = cs->read(cs);
+ wd_ts1 = watchdog->read(watchdog);
+ }
+
+ wd_last = cs->wd_last;
+ cs_last = cs->cs_last;
+
+ /* Validate the watchdog readout window */
+ wd_seq = cycles_to_nsec_safe(watchdog, wd_ts0, wd_ts1);
+ if (wd_seq > WATCHDOG_READOUT_MAX_NS) {
+ /* Store for printout in case all retries fail */
+ watchdog_data.wd_seq = wd_seq;
+ continue;
+ }
+
+ /* Store for subsequent processing */
+ cs->wd_last = wd_ts0;
+ cs->cs_last = cs_ts;
+
+ /* First round or reset pending? */
+ if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || reset_pending)
+ goto reset;
+
+ /* Calculate the nanosecond deltas from the last invocation */
+ wd_delta = cycles_to_nsec_safe(watchdog, wd_last, wd_ts0);
+ cs_delta = cycles_to_nsec_safe(cs, cs_last, cs_ts);
+
+ watchdog_data.wd_delta = wd_delta;
+ watchdog_data.cs_delta = cs_delta;
+
+ /*
+ * Ensure that the deltas are within the readout limits of
+ * the clocksource and the watchdog. Long delays can cause
+ * clocksources to overflow.
+ */
+ max_delta = max(wd_delta, cs_delta);
+ if (max_delta > cs->max_idle_ns || max_delta > watchdog->max_idle_ns)
+ goto reset;
+
+ /*
+ * Calculate and validate the skew against the allowed PPM
+ * value of the maximum delta plus the watchdog readout
+ * time.
+ */
+ if (abs(wd_delta - cs_delta) < (max_delta >> ppm_shift) + wd_seq)
+ return true;
+
+ watchdog_data.result = WD_FREQ_SKEWED;
+ return false;
}
- /* Don't verify ourselves. */
- cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
+ watchdog_data.result = WD_FREQ_TIMEOUT;
+ return false;
+
+reset:
+ cs->flags |= CLOCK_SOURCE_WATCHDOG;
+ watchdog_data.result = WD_FREQ_RESET;
+ return false;
}
-static void clocksource_verify_one_cpu(void *csin)
+/* Synchronization for sched clock */
+static void clocksource_tick_stable(struct clocksource *cs)
{
- struct clocksource *cs = (struct clocksource *)csin;
-
- csnow_mid = cs->read(cs);
+ if (cs == curr_clocksource && cs->tick_stable)
+ cs->tick_stable(cs);
}
-void clocksource_verify_percpu(struct clocksource *cs)
+/* Conditionally enable high resolution mode */
+static void clocksource_enable_highres(struct clocksource *cs)
{
- int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX;
- u64 csnow_begin, csnow_end;
- int cpu, testcpu;
- s64 delta;
+ if ((cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) ||
+ !(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) ||
+ !watchdog || !(watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS))
+ return;
+
+ /* Mark it valid for high-res. */
+ cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
- if (verify_n_cpus == 0)
+ /*
+ * Can't schedule work before finished_booting is
+ * true. clocksource_done_booting will take care of it.
+ */
+ if (!finished_booting)
return;
- cpumask_clear(&cpus_ahead);
- cpumask_clear(&cpus_behind);
- cpus_read_lock();
- migrate_disable();
- clocksource_verify_choose_cpus();
- if (cpumask_empty(&cpus_chosen)) {
- migrate_enable();
- cpus_read_unlock();
- pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name);
+
+ if (cs->flags & CLOCK_SOURCE_WDTEST)
return;
+
+ /*
+ * If this is not the current clocksource let the watchdog thread
+ * reselect it. Due to the change to high res this clocksource
+ * might be preferred now. If it is the current clocksource let the
+ * tick code know about that change.
+ */
+ if (cs != curr_clocksource) {
+ cs->flags |= CLOCK_SOURCE_RESELECT;
+ schedule_work(&watchdog_work);
+ } else {
+ tick_clock_notify();
}
- testcpu = smp_processor_id();
- pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n",
- cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
- preempt_disable();
- for_each_cpu(cpu, &cpus_chosen) {
- if (cpu == testcpu)
- continue;
- csnow_begin = cs->read(cs);
- smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1);
- csnow_end = cs->read(cs);
- delta = (s64)((csnow_mid - csnow_begin) & cs->mask);
- if (delta < 0)
- cpumask_set_cpu(cpu, &cpus_behind);
- delta = (csnow_end - csnow_mid) & cs->mask;
- if (delta < 0)
- cpumask_set_cpu(cpu, &cpus_ahead);
- cs_nsec = cycles_to_nsec_safe(cs, csnow_begin, csnow_end);
- if (cs_nsec > cs_nsec_max)
- cs_nsec_max = cs_nsec;
- if (cs_nsec < cs_nsec_min)
- cs_nsec_min = cs_nsec;
+}
+
+static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 2);
+
+static void watchdog_print_freq_timeout(struct clocksource *cs)
+{
+ if (!__ratelimit(&ratelimit_state))
+ return;
+ pr_info("Watchdog %s read timed out. Readout sequence took: %lluns\n",
+ watchdog->name, watchdog_data.wd_seq);
+}
+
+static void watchdog_print_freq_skew(struct clocksource *cs)
+{
+ pr_warn("Marking clocksource %s unstable due to frequency skew\n", cs->name);
+ pr_warn("Watchdog %20s interval: %16lluns\n", watchdog->name, watchdog_data.wd_delta);
+ pr_warn("Clocksource %20s interval: %16lluns\n", cs->name, watchdog_data.cs_delta);
+}
+
+static void watchdog_handle_remote_timeout(struct clocksource *cs)
+{
+ pr_info_once("Watchdog remote CPU %u read timed out\n", watchdog_data.curr_cpu);
+}
+
+static void watchdog_print_remote_skew(struct clocksource *cs)
+{
+ pr_warn("Marking clocksource %s unstable due to inter CPU skew\n", cs->name);
+ if (watchdog_data.cpu_ts[0] < watchdog_data.cpu_ts[1]) {
+ pr_warn("CPU%u %16llu < CPU%u %16llu (cycles)\n", smp_processor_id(),
+ watchdog_data.cpu_ts[0], watchdog_data.curr_cpu, watchdog_data.cpu_ts[1]);
+ } else {
+ pr_warn("CPU%u %16llu < CPU%u %16llu (cycles)\n", watchdog_data.curr_cpu,
+ watchdog_data.cpu_ts[1], smp_processor_id(), watchdog_data.cpu_ts[0]);
}
- preempt_enable();
- migrate_enable();
- cpus_read_unlock();
- if (!cpumask_empty(&cpus_ahead))
- pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n",
- cpumask_pr_args(&cpus_ahead), testcpu, cs->name);
- if (!cpumask_empty(&cpus_behind))
- pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n",
- cpumask_pr_args(&cpus_behind), testcpu, cs->name);
- pr_info(" CPU %d check durations %lldns - %lldns for clocksource %s.\n",
- testcpu, cs_nsec_min, cs_nsec_max, cs->name);
-}
-EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
+}
-static inline void clocksource_reset_watchdog(void)
+static void watchdog_check_result(struct clocksource *cs)
{
- struct clocksource *cs;
+ switch (watchdog_data.result) {
+ case WD_SUCCESS:
+ clocksource_tick_stable(cs);
+ clocksource_enable_highres(cs);
+ return;
- list_for_each_entry(cs, &watchdog_list, wd_list)
+ case WD_FREQ_TIMEOUT:
+ watchdog_print_freq_timeout(cs);
+ /* Try again later and invalidate the reference timestamps. */
cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
-}
+ return;
+ case WD_FREQ_NO_WATCHDOG:
+ case WD_FREQ_RESET:
+ /*
+ * Nothing to do when the reference timestamps were reset
+ * or no watchdog clocksource registered.
+ */
+ return;
+
+ case WD_FREQ_SKEWED:
+ watchdog_print_freq_skew(cs);
+ break;
+
+ case WD_CPU_TIMEOUT:
+ /* Remote check timed out. Try again next cycle. */
+ watchdog_handle_remote_timeout(cs);
+ return;
+
+ case WD_CPU_SKEWED:
+ watchdog_print_remote_skew(cs);
+ break;
+ }
+ __clocksource_unstable(cs);
+}
static void clocksource_watchdog(struct timer_list *unused)
{
- int64_t wd_nsec, cs_nsec, interval;
- u64 csnow, wdnow, cslast, wdlast;
- int next_cpu, reset_pending;
struct clocksource *cs;
- enum wd_read_status read_ret;
- unsigned long extra_wait = 0;
- u32 md;
+ bool reset_pending;
- spin_lock(&watchdog_lock);
+ guard(spinlock)(&watchdog_lock);
if (!watchdog_running)
- goto out;
+ return;
reset_pending = atomic_read(&watchdog_reset_pending);
list_for_each_entry(cs, &watchdog_list, wd_list) {
-
/* Clocksource already marked unstable? */
if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
if (finished_booting)
@@ -446,170 +659,40 @@ static void clocksource_watchdog(struct timer_list *unused)
continue;
}
- read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
-
- if (read_ret == WD_READ_UNSTABLE) {
- /* Clock readout unreliable, so give it up. */
- __clocksource_unstable(cs);
- continue;
- }
-
- /*
- * When WD_READ_SKIP is returned, it means the system is likely
- * under very heavy load, where the latency of reading
- * watchdog/clocksource is very big, and affect the accuracy of
- * watchdog check. So give system some space and suspend the
- * watchdog check for 5 minutes.
- */
- if (read_ret == WD_READ_SKIP) {
- /*
- * As the watchdog timer will be suspended, and
- * cs->last could keep unchanged for 5 minutes, reset
- * the counters.
- */
- clocksource_reset_watchdog();
- extra_wait = HZ * 300;
- break;
- }
-
- /* Clocksource initialized ? */
- if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
- atomic_read(&watchdog_reset_pending)) {
- cs->flags |= CLOCK_SOURCE_WATCHDOG;
- cs->wd_last = wdnow;
- cs->cs_last = csnow;
- continue;
+ /* Compare against watchdog clocksource if available */
+ if (watchdog_check_freq(cs, reset_pending)) {
+ /* Check for inter CPU skew */
+ watchdog_check_cpu_skew(cs);
}
- wd_nsec = cycles_to_nsec_safe(watchdog, cs->wd_last, wdnow);
- cs_nsec = cycles_to_nsec_safe(cs, cs->cs_last, csnow);
- wdlast = cs->wd_last; /* save these in case we print them */
- cslast = cs->cs_last;
- cs->cs_last = csnow;
- cs->wd_last = wdnow;
-
- if (atomic_read(&watchdog_reset_pending))
- continue;
-
- /*
- * The processing of timer softirqs can get delayed (usually
- * on account of ksoftirqd not getting to run in a timely
- * manner), which causes the watchdog interval to stretch.
- * Skew detection may fail for longer watchdog intervals
- * on account of fixed margins being used.
- * Some clocksources, e.g. acpi_pm, cannot tolerate
- * watchdog intervals longer than a few seconds.
- */
- interval = max(cs_nsec, wd_nsec);
- if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) {
- if (system_state > SYSTEM_SCHEDULING &&
- interval > 2 * watchdog_max_interval) {
- watchdog_max_interval = interval;
- pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n",
- cs_nsec, wd_nsec);
- }
- watchdog_timer.expires = jiffies;
- continue;
- }
-
- /* Check the deviation from the watchdog clocksource. */
- md = cs->uncertainty_margin + watchdog->uncertainty_margin;
- if (abs(cs_nsec - wd_nsec) > md) {
- s64 cs_wd_msec;
- s64 wd_msec;
- u32 wd_rem;
-
- pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
- smp_processor_id(), cs->name);
- pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n",
- watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
- pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
- cs->name, cs_nsec, csnow, cslast, cs->mask);
- cs_wd_msec = div_s64_rem(cs_nsec - wd_nsec, 1000 * 1000, &wd_rem);
- wd_msec = div_s64_rem(wd_nsec, 1000 * 1000, &wd_rem);
- pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n",
- cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec);
- if (curr_clocksource == cs)
- pr_warn(" '%s' is current clocksource.\n", cs->name);
- else if (curr_clocksource)
- pr_warn(" '%s' (not '%s') is current clocksource.\n", curr_clocksource->name, cs->name);
- else
- pr_warn(" No current clocksource.\n");
- __clocksource_unstable(cs);
- continue;
- }
-
- if (cs == curr_clocksource && cs->tick_stable)
- cs->tick_stable(cs);
-
- if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
- (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
- (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
- /* Mark it valid for high-res. */
- cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
-
- /*
- * clocksource_done_booting() will sort it if
- * finished_booting is not set yet.
- */
- if (!finished_booting)
- continue;
-
- /*
- * If this is not the current clocksource let
- * the watchdog thread reselect it. Due to the
- * change to high res this clocksource might
- * be preferred now. If it is the current
- * clocksource let the tick code know about
- * that change.
- */
- if (cs != curr_clocksource) {
- cs->flags |= CLOCK_SOURCE_RESELECT;
- schedule_work(&watchdog_work);
- } else {
- tick_clock_notify();
- }
- }
+ watchdog_check_result(cs);
}
- /*
- * We only clear the watchdog_reset_pending, when we did a
- * full cycle through all clocksources.
- */
+ /* Clear after the full clocksource walk */
if (reset_pending)
atomic_dec(&watchdog_reset_pending);
- /*
- * Cycle through CPUs to check if the CPUs stay synchronized
- * to each other.
- */
- next_cpu = cpumask_next_wrap(raw_smp_processor_id(), cpu_online_mask);
-
- /*
- * Arm timer if not already pending: could race with concurrent
- * pair clocksource_stop_watchdog() clocksource_start_watchdog().
- */
+ /* Could have been rearmed by a stop/start cycle */
if (!timer_pending(&watchdog_timer)) {
- watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
- add_timer_on(&watchdog_timer, next_cpu);
+ watchdog_timer.expires += WATCHDOG_INTERVAL;
+ add_timer_local(&watchdog_timer);
}
-out:
- spin_unlock(&watchdog_lock);
}
static inline void clocksource_start_watchdog(void)
{
- if (watchdog_running || !watchdog || list_empty(&watchdog_list))
+ if (watchdog_running || list_empty(&watchdog_list))
return;
- timer_setup(&watchdog_timer, clocksource_watchdog, 0);
+ timer_setup(&watchdog_timer, clocksource_watchdog, TIMER_PINNED);
watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
- add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
+
+ add_timer_on(&watchdog_timer, get_boot_cpu_id());
watchdog_running = 1;
}
static inline void clocksource_stop_watchdog(void)
{
- if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
+ if (!watchdog_running || !list_empty(&watchdog_list))
return;
timer_delete(&watchdog_timer);
watchdog_running = 0;
@@ -651,6 +734,13 @@ static void clocksource_select_watchdog(bool fallback)
if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
continue;
+ /*
+ * If it's not continuous, don't put the fox in charge of
+ * the henhouse.
+ */
+ if (!(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS))
+ continue;
+
/* Skip current if we were requested for a fallback. */
if (fallback && cs == old_wd)
continue;
@@ -690,12 +780,6 @@ static int __clocksource_watchdog_kthread(void)
unsigned long flags;
int select = 0;
- /* Do any required per-CPU skew verification. */
- if (curr_clocksource &&
- curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE &&
- curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU)
- clocksource_verify_percpu(curr_clocksource);
-
spin_lock_irqsave(&watchdog_lock, flags);
list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
@@ -1016,6 +1100,8 @@ static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
continue;
if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES))
continue;
+ if (cs->flags & CLOCK_SOURCE_WDTEST)
+ continue;
return cs;
}
return NULL;
@@ -1040,6 +1126,8 @@ static void __clocksource_select(bool skipcur)
continue;
if (strcmp(cs->name, override_name) != 0)
continue;
+ if (cs->flags & CLOCK_SOURCE_WDTEST)
+ continue;
/*
* Check to make sure we don't switch to a non-highres
* capable clocksource if the tick code is in oneshot
@@ -1169,31 +1257,10 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq
clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
NSEC_PER_SEC / scale, sec * scale);
- }
- /*
- * If the uncertainty margin is not specified, calculate it. If
- * both scale and freq are non-zero, calculate the clock period, but
- * bound below at 2*WATCHDOG_MAX_SKEW, that is, 500ppm by default.
- * However, if either of scale or freq is zero, be very conservative
- * and take the tens-of-milliseconds WATCHDOG_THRESHOLD value
- * for the uncertainty margin. Allow stupidly small uncertainty
- * margins to be specified by the caller for testing purposes,
- * but warn to discourage production use of this capability.
- *
- * Bottom line: The sum of the uncertainty margins of the
- * watchdog clocksource and the clocksource under test will be at
- * least 500ppm by default. For more information, please see the
- * comment preceding CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US above.
- */
- if (scale && freq && !cs->uncertainty_margin) {
- cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq);
- if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW)
- cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW;
- } else if (!cs->uncertainty_margin) {
- cs->uncertainty_margin = WATCHDOG_THRESHOLD;
+ /* Update cs::freq_khz */
+ cs->freq_khz = div_u64((u64)freq * scale, 1000);
}
- WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW);
/*
* Ensure clocksources that have large 'mult' values don't overflow
@@ -1241,6 +1308,10 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX))
cs->id = CSID_GENERIC;
+
+ if (WARN_ON_ONCE(!freq && cs->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT))
+ cs->flags &= ~CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT;
+
if (cs->vdso_clock_mode < 0 ||
cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) {
pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n",
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 860af7a58428..5bd6efe598f0 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -50,6 +50,28 @@
#include "tick-internal.h"
/*
+ * Constants to set the queued state of the timer (INACTIVE, ENQUEUED)
+ *
+ * The callback state is kept separate in the CPU base because having it in
+ * the timer would require touching the timer after the callback, which
+ * makes it impossible to free the timer from the callback function.
+ *
+ * Therefore we track the callback state in:
+ *
+ * timer->base->cpu_base->running == timer
+ *
+ * On SMP it is possible to have a "callback function running and enqueued"
+ * status. It happens for example when a posix timer expired and the callback
+ * queued a signal. Between dropping the lock which protects the posix timer
+ * and reacquiring the base lock of the hrtimer, another CPU can deliver the
+ * signal and rearm the timer.
+ *
+ * All state transitions are protected by cpu_base->lock.
+ */
+#define HRTIMER_STATE_INACTIVE false
+#define HRTIMER_STATE_ENQUEUED true
+
+/*
* The resolution of the clocks. The resolution value is returned in
* the clock_getres() system call to give application programmers an
* idea of the (in)accuracy of timers. Timer values are rounded up to
@@ -77,43 +99,22 @@ static ktime_t __hrtimer_cb_get_time(clockid_t clock_id);
* to reach a base using a clockid, hrtimer_clockid_to_base()
* is used to convert from clockid to the proper hrtimer_base_type.
*/
+
+#define BASE_INIT(idx, cid) \
+ [idx] = { .index = idx, .clockid = cid }
+
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
- .clock_base =
- {
- {
- .index = HRTIMER_BASE_MONOTONIC,
- .clockid = CLOCK_MONOTONIC,
- },
- {
- .index = HRTIMER_BASE_REALTIME,
- .clockid = CLOCK_REALTIME,
- },
- {
- .index = HRTIMER_BASE_BOOTTIME,
- .clockid = CLOCK_BOOTTIME,
- },
- {
- .index = HRTIMER_BASE_TAI,
- .clockid = CLOCK_TAI,
- },
- {
- .index = HRTIMER_BASE_MONOTONIC_SOFT,
- .clockid = CLOCK_MONOTONIC,
- },
- {
- .index = HRTIMER_BASE_REALTIME_SOFT,
- .clockid = CLOCK_REALTIME,
- },
- {
- .index = HRTIMER_BASE_BOOTTIME_SOFT,
- .clockid = CLOCK_BOOTTIME,
- },
- {
- .index = HRTIMER_BASE_TAI_SOFT,
- .clockid = CLOCK_TAI,
- },
+ .clock_base = {
+ BASE_INIT(HRTIMER_BASE_MONOTONIC, CLOCK_MONOTONIC),
+ BASE_INIT(HRTIMER_BASE_REALTIME, CLOCK_REALTIME),
+ BASE_INIT(HRTIMER_BASE_BOOTTIME, CLOCK_BOOTTIME),
+ BASE_INIT(HRTIMER_BASE_TAI, CLOCK_TAI),
+ BASE_INIT(HRTIMER_BASE_MONOTONIC_SOFT, CLOCK_MONOTONIC),
+ BASE_INIT(HRTIMER_BASE_REALTIME_SOFT, CLOCK_REALTIME),
+ BASE_INIT(HRTIMER_BASE_BOOTTIME_SOFT, CLOCK_BOOTTIME),
+ BASE_INIT(HRTIMER_BASE_TAI_SOFT, CLOCK_TAI),
},
.csd = CSD_INIT(retrigger_next_event, NULL)
};
@@ -126,23 +127,43 @@ static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
return likely(base->online);
}
+#ifdef CONFIG_HIGH_RES_TIMERS
+DEFINE_STATIC_KEY_FALSE(hrtimer_highres_enabled_key);
+
+static void hrtimer_hres_workfn(struct work_struct *work)
+{
+ static_branch_enable(&hrtimer_highres_enabled_key);
+}
+
+static DECLARE_WORK(hrtimer_hres_work, hrtimer_hres_workfn);
+
+static inline void hrtimer_schedule_hres_work(void)
+{
+ if (!hrtimer_highres_enabled())
+ schedule_work(&hrtimer_hres_work);
+}
+#else
+static inline void hrtimer_schedule_hres_work(void) { }
+#endif
+
/*
* Functions and macros which are different for UP/SMP systems are kept in a
* single place
*/
#ifdef CONFIG_SMP
-
/*
* We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
* such that hrtimer_callback_running() can unconditionally dereference
* timer->base->cpu_base
*/
static struct hrtimer_cpu_base migration_cpu_base = {
- .clock_base = { {
- .cpu_base = &migration_cpu_base,
- .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq,
- &migration_cpu_base.lock),
- }, },
+ .clock_base = {
+ [0] = {
+ .cpu_base = &migration_cpu_base,
+ .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq,
+ &migration_cpu_base.lock),
+ },
+ },
};
#define migration_base migration_cpu_base.clock_base[0]
@@ -159,15 +180,13 @@ static struct hrtimer_cpu_base migration_cpu_base = {
* possible to set timer->base = &migration_base and drop the lock: the timer
* remains locked.
*/
-static
-struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
- unsigned long *flags)
+static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
+ unsigned long *flags)
__acquires(&timer->base->lock)
{
- struct hrtimer_clock_base *base;
-
for (;;) {
- base = READ_ONCE(timer->base);
+ struct hrtimer_clock_base *base = READ_ONCE(timer->base);
+
if (likely(base != &migration_base)) {
raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
if (likely(base == timer->base))
@@ -220,7 +239,7 @@ static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_
return expires >= new_base->cpu_base->expires_next;
}
-static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned)
+static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, bool pinned)
{
if (!hrtimer_base_is_online(base)) {
int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));
@@ -248,8 +267,7 @@ static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *
* the timer callback is currently running.
*/
static inline struct hrtimer_clock_base *
-switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
- int pinned)
+switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, bool pinned)
{
struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
struct hrtimer_clock_base *new_base;
@@ -262,13 +280,12 @@ again:
if (base != new_base) {
/*
- * We are trying to move timer to new_base.
- * However we can't change timer's base while it is running,
- * so we keep it on the same CPU. No hassle vs. reprogramming
- * the event source in the high resolution case. The softirq
- * code will take care of this when the timer function has
- * completed. There is no conflict as we hold the lock until
- * the timer is enqueued.
+ * We are trying to move timer to new_base. However we can't
+ * change timer's base while it is running, so we keep it on
+ * the same CPU. No hassle vs. reprogramming the event source
+ * in the high resolution case. The remote CPU will take care
+ * of this when the timer function has completed. There is no
+ * conflict as we hold the lock until the timer is enqueued.
*/
if (unlikely(hrtimer_callback_running(timer)))
return base;
@@ -278,8 +295,7 @@ again:
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
- if (!hrtimer_suitable_target(timer, new_base, new_cpu_base,
- this_cpu_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) {
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
new_cpu_base = this_cpu_base;
@@ -298,14 +314,13 @@ again:
#else /* CONFIG_SMP */
-static inline struct hrtimer_clock_base *
-lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
+static inline struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
+ unsigned long *flags)
__acquires(&timer->base->cpu_base->lock)
{
struct hrtimer_clock_base *base = timer->base;
raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
-
return base;
}
@@ -340,7 +355,7 @@ s64 __ktime_divns(const ktime_t kt, s64 div)
return dclc < 0 ? -tmp : tmp;
}
EXPORT_SYMBOL_GPL(__ktime_divns);
-#endif /* BITS_PER_LONG >= 64 */
+#endif /* BITS_PER_LONG < 64 */
/*
* Add two ktime values and do a safety check for overflow:
@@ -422,12 +437,37 @@ static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
}
}
+/* Stub timer callback for improperly used timers. */
+static enum hrtimer_restart stub_timer(struct hrtimer *unused)
+{
+ WARN_ON_ONCE(1);
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * hrtimer_fixup_assert_init is called when:
+ * - an untracked/uninit-ed object is found
+ */
+static bool hrtimer_fixup_assert_init(void *addr, enum debug_obj_state state)
+{
+ struct hrtimer *timer = addr;
+
+ switch (state) {
+ case ODEBUG_STATE_NOTAVAILABLE:
+ hrtimer_setup(timer, stub_timer, CLOCK_MONOTONIC, 0);
+ return true;
+ default:
+ return false;
+ }
+}
+
static const struct debug_obj_descr hrtimer_debug_descr = {
- .name = "hrtimer",
- .debug_hint = hrtimer_debug_hint,
- .fixup_init = hrtimer_fixup_init,
- .fixup_activate = hrtimer_fixup_activate,
- .fixup_free = hrtimer_fixup_free,
+ .name = "hrtimer",
+ .debug_hint = hrtimer_debug_hint,
+ .fixup_init = hrtimer_fixup_init,
+ .fixup_activate = hrtimer_fixup_activate,
+ .fixup_free = hrtimer_fixup_free,
+ .fixup_assert_init = hrtimer_fixup_assert_init,
};
static inline void debug_hrtimer_init(struct hrtimer *timer)
@@ -440,8 +480,7 @@ static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer)
debug_object_init_on_stack(timer, &hrtimer_debug_descr);
}
-static inline void debug_hrtimer_activate(struct hrtimer *timer,
- enum hrtimer_mode mode)
+static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode)
{
debug_object_activate(timer, &hrtimer_debug_descr);
}
@@ -451,6 +490,11 @@ static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
debug_object_deactivate(timer, &hrtimer_debug_descr);
}
+static inline void debug_hrtimer_assert_init(struct hrtimer *timer)
+{
+ debug_object_assert_init(timer, &hrtimer_debug_descr);
+}
+
void destroy_hrtimer_on_stack(struct hrtimer *timer)
{
debug_object_free(timer, &hrtimer_debug_descr);
@@ -461,9 +505,9 @@ EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
static inline void debug_hrtimer_init(struct hrtimer *timer) { }
static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { }
-static inline void debug_hrtimer_activate(struct hrtimer *timer,
- enum hrtimer_mode mode) { }
+static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { }
static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
+static inline void debug_hrtimer_assert_init(struct hrtimer *timer) { }
#endif
static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode)
@@ -479,80 +523,80 @@ static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid
trace_hrtimer_setup(timer, clockid, mode);
}
-static inline void debug_activate(struct hrtimer *timer,
- enum hrtimer_mode mode)
+static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode, bool was_armed)
{
debug_hrtimer_activate(timer, mode);
- trace_hrtimer_start(timer, mode);
+ trace_hrtimer_start(timer, mode, was_armed);
}
-static inline void debug_deactivate(struct hrtimer *timer)
-{
- debug_hrtimer_deactivate(timer);
- trace_hrtimer_cancel(timer);
-}
+#define for_each_active_base(base, cpu_base, active) \
+ for (unsigned int idx = ffs(active); idx--; idx = ffs((active))) \
+ for (bool done = false; !done; active &= ~(1U << idx)) \
+ for (base = &cpu_base->clock_base[idx]; !done; done = true)
-static struct hrtimer_clock_base *
-__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
+#define hrtimer_from_timerqueue_node(_n) container_of_const(_n, struct hrtimer, node)
+
+#if defined(CONFIG_NO_HZ_COMMON)
+/*
+ * Same as hrtimer_bases_next_event() below, but skips the excluded timer and
+ * does not update cpu_base->next_timer/expires.
+ */
+static ktime_t hrtimer_bases_next_event_without(struct hrtimer_cpu_base *cpu_base,
+ const struct hrtimer *exclude,
+ unsigned int active, ktime_t expires_next)
{
- unsigned int idx;
+ struct hrtimer_clock_base *base;
+ ktime_t expires;
- if (!*active)
- return NULL;
+ lockdep_assert_held(&cpu_base->lock);
- idx = __ffs(*active);
- *active &= ~(1U << idx);
+ for_each_active_base(base, cpu_base, active) {
+ expires = ktime_sub(base->expires_next, base->offset);
+ if (expires >= expires_next)
+ continue;
+
+ /*
+ * If the excluded timer is the first on this base evaluate the
+ * next timer.
+ */
+ struct timerqueue_linked_node *node = timerqueue_linked_first(&base->active);
- return &cpu_base->clock_base[idx];
+ if (unlikely(&exclude->node == node)) {
+ node = timerqueue_linked_next(node);
+ if (!node)
+ continue;
+ expires = ktime_sub(node->expires, base->offset);
+ if (expires >= expires_next)
+ continue;
+ }
+ expires_next = expires;
+ }
+ /* If base->offset changed, the result might be negative */
+ return max(expires_next, 0);
}
+#endif
-#define for_each_active_base(base, cpu_base, active) \
- while ((base = __next_base((cpu_base), &(active))))
+static __always_inline struct hrtimer *clock_base_next_timer(struct hrtimer_clock_base *base)
+{
+ struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active);
-static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
- const struct hrtimer *exclude,
- unsigned int active,
- ktime_t expires_next)
+ return hrtimer_from_timerqueue_node(next);
+}
+
+/* Find the base with the earliest expiry */
+static void hrtimer_bases_first(struct hrtimer_cpu_base *cpu_base, unsigned int active,
+ ktime_t *expires_next, struct hrtimer **next_timer)
{
struct hrtimer_clock_base *base;
ktime_t expires;
for_each_active_base(base, cpu_base, active) {
- struct timerqueue_node *next;
- struct hrtimer *timer;
-
- next = timerqueue_getnext(&base->active);
- timer = container_of(next, struct hrtimer, node);
- if (timer == exclude) {
- /* Get to the next timer in the queue. */
- next = timerqueue_iterate_next(next);
- if (!next)
- continue;
-
- timer = container_of(next, struct hrtimer, node);
- }
- expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
- if (expires < expires_next) {
- expires_next = expires;
-
- /* Skip cpu_base update if a timer is being excluded. */
- if (exclude)
- continue;
-
- if (timer->is_soft)
- cpu_base->softirq_next_timer = timer;
- else
- cpu_base->next_timer = timer;
+ expires = ktime_sub(base->expires_next, base->offset);
+ if (expires < *expires_next) {
+ *expires_next = expires;
+ *next_timer = clock_base_next_timer(base);
}
}
- /*
- * clock_was_set() might have changed base->offset of any of
- * the clock bases so the result might be negative. Fix it up
- * to prevent a false positive in clockevents_program_event().
- */
- if (expires_next < 0)
- expires_next = 0;
- return expires_next;
}
/*
@@ -575,30 +619,28 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
* - HRTIMER_ACTIVE_SOFT, or
* - HRTIMER_ACTIVE_HARD.
*/
-static ktime_t
-__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
+static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
{
- unsigned int active;
struct hrtimer *next_timer = NULL;
ktime_t expires_next = KTIME_MAX;
+ unsigned int active;
+
+ lockdep_assert_held(&cpu_base->lock);
if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
- cpu_base->softirq_next_timer = NULL;
- expires_next = __hrtimer_next_event_base(cpu_base, NULL,
- active, KTIME_MAX);
-
- next_timer = cpu_base->softirq_next_timer;
+ if (active)
+ hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer);
+ cpu_base->softirq_next_timer = next_timer;
}
if (active_mask & HRTIMER_ACTIVE_HARD) {
active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
+ if (active)
+ hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer);
cpu_base->next_timer = next_timer;
- expires_next = __hrtimer_next_event_base(cpu_base, NULL, active,
- expires_next);
}
-
- return expires_next;
+ return max(expires_next, 0);
}
static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base)
@@ -638,8 +680,8 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
- ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
- offs_real, offs_boot, offs_tai);
+ ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real,
+ offs_boot, offs_tai);
base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real;
base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot;
@@ -649,7 +691,9 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
}
/*
- * Is the high resolution mode active ?
+ * Is the high resolution mode active in the CPU base. This cannot use the
+ * static key as the CPUs are switched to high resolution mode
+ * asynchronously.
*/
static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
{
@@ -657,8 +701,13 @@ static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
cpu_base->hres_active : 0;
}
-static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base,
- struct hrtimer *next_timer,
+static inline void hrtimer_rearm_event(ktime_t expires_next, bool deferred)
+{
+ trace_hrtimer_rearm(expires_next, deferred);
+ tick_program_event(expires_next, 1);
+}
+
+static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, struct hrtimer *next_timer,
ktime_t expires_next)
{
cpu_base->expires_next = expires_next;
@@ -683,20 +732,13 @@ static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base,
if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
return;
- tick_program_event(expires_next, 1);
+ hrtimer_rearm_event(expires_next, false);
}
-/*
- * Reprogram the event source with checking both queues for the
- * next event
- * Called with interrupts disabled and base->lock held
- */
-static void
-hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
+/* Reprogram the event source with an evaluation of all clock bases */
+static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, bool skip_equal)
{
- ktime_t expires_next;
-
- expires_next = hrtimer_update_next_event(cpu_base);
+ ktime_t expires_next = hrtimer_update_next_event(cpu_base);
if (skip_equal && expires_next == cpu_base->expires_next)
return;
@@ -707,57 +749,49 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
/* High resolution timer related functions */
#ifdef CONFIG_HIGH_RES_TIMERS
-/*
- * High resolution timer enabled ?
- */
+/* High resolution timer enabled ? */
static bool hrtimer_hres_enabled __read_mostly = true;
unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
EXPORT_SYMBOL_GPL(hrtimer_resolution);
-/*
- * Enable / Disable high resolution mode
- */
+/* Enable / Disable high resolution mode */
static int __init setup_hrtimer_hres(char *str)
{
return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
}
-
__setup("highres=", setup_hrtimer_hres);
-/*
- * hrtimer_high_res_enabled - query, if the highres mode is enabled
- */
-static inline int hrtimer_is_hres_enabled(void)
+/* hrtimer_is_hres_enabled - query whether the highres mode is enabled */
+static inline bool hrtimer_is_hres_enabled(void)
{
return hrtimer_hres_enabled;
}
-/*
- * Switch to high resolution mode
- */
+/* Switch to high resolution mode */
static void hrtimer_switch_to_hres(void)
{
struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
if (tick_init_highres()) {
- pr_warn("Could not switch to high resolution mode on CPU %u\n",
- base->cpu);
+ pr_warn("Could not switch to high resolution mode on CPU %u\n", base->cpu);
return;
}
- base->hres_active = 1;
+ base->hres_active = true;
hrtimer_resolution = HIGH_RES_NSEC;
tick_setup_sched_timer(true);
/* "Retrigger" the interrupt to get things going */
retrigger_next_event(NULL);
+ hrtimer_schedule_hres_work();
}
#else
-static inline int hrtimer_is_hres_enabled(void) { return 0; }
+static inline bool hrtimer_is_hres_enabled(void) { return false; }
static inline void hrtimer_switch_to_hres(void) { }
#endif /* CONFIG_HIGH_RES_TIMERS */
+
/*
* Retrigger next event is called after clock was set with interrupts
* disabled through an SMP function call or directly from low level
@@ -792,13 +826,12 @@ static void retrigger_next_event(void *arg)
* In periodic low resolution mode, the next softirq expiration
* must also be updated.
*/
- raw_spin_lock(&base->lock);
+ guard(raw_spinlock)(&base->lock);
hrtimer_update_base(base);
if (hrtimer_hres_active(base))
- hrtimer_force_reprogram(base, 0);
+ hrtimer_force_reprogram(base, /* skip_equal */ false);
else
hrtimer_update_next_event(base);
- raw_spin_unlock(&base->lock);
}
/*
@@ -812,10 +845,11 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
struct hrtimer_clock_base *base = timer->base;
- ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
+ ktime_t expires = hrtimer_get_expires(timer);
- WARN_ON_ONCE(hrtimer_get_expires(timer) < 0);
+ WARN_ON_ONCE(expires < 0);
+ expires = ktime_sub(expires, base->offset);
/*
* CLOCK_REALTIME timer might be requested with an absolute
* expiry time which is less than base->offset. Set it to 0.
@@ -842,8 +876,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
timer_cpu_base->softirq_next_timer = timer;
timer_cpu_base->softirq_expires_next = expires;
- if (!ktime_before(expires, timer_cpu_base->expires_next) ||
- !reprogram)
+ if (!ktime_before(expires, timer_cpu_base->expires_next) || !reprogram)
return;
}
@@ -857,11 +890,8 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
if (expires >= cpu_base->expires_next)
return;
- /*
- * If the hrtimer interrupt is running, then it will reevaluate the
- * clock bases and reprogram the clock event device.
- */
- if (cpu_base->in_hrtirq)
+ /* If a deferred rearm is pending skip reprogramming the device */
+ if (cpu_base->deferred_rearm)
return;
cpu_base->next_timer = timer;
@@ -869,8 +899,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
__hrtimer_reprogram(cpu_base, timer, expires);
}
-static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
- unsigned int active)
+static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int active)
{
struct hrtimer_clock_base *base;
unsigned int seq;
@@ -896,13 +925,11 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
if (seq == cpu_base->clock_was_set_seq)
return false;
- /*
- * If the remote CPU is currently handling an hrtimer interrupt, it
- * will reevaluate the first expiring timer of all clock bases
- * before reprogramming. Nothing to do here.
- */
- if (cpu_base->in_hrtirq)
+ /* If a deferred rearm is pending the remote CPU will take care of it */
+ if (cpu_base->deferred_rearm) {
+ cpu_base->deferred_needs_update = true;
return false;
+ }
/*
* Walk the affected clock bases and check whether the first expiring
@@ -913,9 +940,9 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
active &= cpu_base->active_bases;
for_each_active_base(base, cpu_base, active) {
- struct timerqueue_node *next;
+ struct timerqueue_linked_node *next;
- next = timerqueue_getnext(&base->active);
+ next = timerqueue_linked_first(&base->active);
expires = ktime_sub(next->expires, base->offset);
if (expires < cpu_base->expires_next)
return true;
@@ -947,11 +974,9 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
*/
void clock_was_set(unsigned int bases)
{
- struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases);
cpumask_var_t mask;
- int cpu;
- if (!hrtimer_hres_active(cpu_base) && !tick_nohz_is_active())
+ if (!hrtimer_highres_enabled() && !tick_nohz_is_active())
goto out_timerfd;
if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
@@ -960,23 +985,19 @@ void clock_was_set(unsigned int bases)
}
/* Avoid interrupting CPUs if possible */
- cpus_read_lock();
- for_each_online_cpu(cpu) {
- unsigned long flags;
-
- cpu_base = &per_cpu(hrtimer_bases, cpu);
- raw_spin_lock_irqsave(&cpu_base->lock, flags);
+ scoped_guard(cpus_read_lock) {
+ int cpu;
- if (update_needs_ipi(cpu_base, bases))
- cpumask_set_cpu(cpu, mask);
+ for_each_online_cpu(cpu) {
+ struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
- raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+ guard(raw_spinlock_irqsave)(&cpu_base->lock);
+ if (update_needs_ipi(cpu_base, bases))
+ cpumask_set_cpu(cpu, mask);
+ }
+ scoped_guard(preempt)
+ smp_call_function_many(mask, retrigger_next_event, NULL, 1);
}
-
- preempt_disable();
- smp_call_function_many(mask, retrigger_next_event, NULL, 1);
- preempt_enable();
- cpus_read_unlock();
free_cpumask_var(mask);
out_timerfd:
@@ -1011,11 +1032,8 @@ void hrtimers_resume_local(void)
retrigger_next_event(NULL);
}
-/*
- * Counterpart to lock_hrtimer_base above:
- */
-static inline
-void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
+/* Counterpart to lock_hrtimer_base above */
+static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
__releases(&timer->base->cpu_base->lock)
{
raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
@@ -1032,7 +1050,7 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
* .. note::
* This only updates the timer expiry value and does not requeue the timer.
*
- * There is also a variant of the function hrtimer_forward_now().
+ * There is also a variant of this function: hrtimer_forward_now().
*
* Context: Can be safely called from the callback function of @timer. If called
* from other contexts @timer must neither be enqueued nor running the
@@ -1042,15 +1060,15 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
*/
u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
{
- u64 orun = 1;
ktime_t delta;
+ u64 orun = 1;
delta = ktime_sub(now, hrtimer_get_expires(timer));
if (delta < 0)
return 0;
- if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
+ if (WARN_ON(timer->is_queued))
return 0;
if (interval < hrtimer_resolution)
@@ -1079,73 +1097,98 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
* enqueue_hrtimer - internal function to (re)start a timer
*
* The timer is inserted in expiry order. Insertion into the
- * red black tree is O(log(n)). Must hold the base lock.
+ * red black tree is O(log(n)).
*
* Returns true when the new timer is the leftmost timer in the tree.
*/
static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
- enum hrtimer_mode mode)
+ enum hrtimer_mode mode, bool was_armed)
{
- debug_activate(timer, mode);
+ lockdep_assert_held(&base->cpu_base->lock);
+
+ debug_activate(timer, mode, was_armed);
WARN_ON_ONCE(!base->cpu_base->online);
base->cpu_base->active_bases |= 1 << base->index;
/* Pairs with the lockless read in hrtimer_is_queued() */
- WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED);
+ WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED);
+
+ if (!timerqueue_linked_add(&base->active, &timer->node))
+ return false;
+
+ base->expires_next = hrtimer_get_expires(timer);
+ return true;
+}
- return timerqueue_add(&base->active, &timer->node);
+static inline void base_update_next_timer(struct hrtimer_clock_base *base)
+{
+ struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active);
+
+ base->expires_next = next ? next->expires : KTIME_MAX;
}
/*
* __remove_hrtimer - internal function to remove a timer
*
- * Caller must hold the base lock.
- *
* High resolution timer mode reprograms the clock event device when the
* timer is the one which expires next. The caller can disable this by setting
* reprogram to zero. This is useful, when the context does a reprogramming
* anyway (e.g. timer interrupt)
*/
-static void __remove_hrtimer(struct hrtimer *timer,
- struct hrtimer_clock_base *base,
- u8 newstate, int reprogram)
+static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
+ bool newstate, bool reprogram)
{
struct hrtimer_cpu_base *cpu_base = base->cpu_base;
- u8 state = timer->state;
+ bool was_first;
- /* Pairs with the lockless read in hrtimer_is_queued() */
- WRITE_ONCE(timer->state, newstate);
- if (!(state & HRTIMER_STATE_ENQUEUED))
+ lockdep_assert_held(&cpu_base->lock);
+
+ if (!timer->is_queued)
return;
- if (!timerqueue_del(&base->active, &timer->node))
+ /* Pairs with the lockless read in hrtimer_is_queued() */
+ WRITE_ONCE(timer->is_queued, newstate);
+
+ was_first = !timerqueue_linked_prev(&timer->node);
+
+ if (!timerqueue_linked_del(&base->active, &timer->node))
cpu_base->active_bases &= ~(1 << base->index);
+ /* Nothing to update if this was not the first timer in the base */
+ if (!was_first)
+ return;
+
+ base_update_next_timer(base);
+
/*
- * Note: If reprogram is false we do not update
- * cpu_base->next_timer. This happens when we remove the first
- * timer on a remote cpu. No harm as we never dereference
- * cpu_base->next_timer. So the worst thing what can happen is
- * an superfluous call to hrtimer_force_reprogram() on the
- * remote cpu later on if the same timer gets enqueued again.
+ * If reprogram is false don't update cpu_base->next_timer and do not
+ * touch the clock event device.
+ *
+ * This happens when removing the first timer on a remote CPU, which
+ * will be handled by the remote CPU's interrupt. It also happens when
+ * a local timer is removed to be immediately restarted. That's handled
+ * at the call site.
*/
- if (reprogram && timer == cpu_base->next_timer)
- hrtimer_force_reprogram(cpu_base, 1);
+ if (!reprogram || timer != cpu_base->next_timer || timer->is_lazy)
+ return;
+
+ if (cpu_base->deferred_rearm)
+ cpu_base->deferred_needs_update = true;
+ else
+ hrtimer_force_reprogram(cpu_base, /* skip_equal */ true);
}
-/*
- * remove hrtimer, called with base lock held
- */
-static inline int
-remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
- bool restart, bool keep_local)
+static inline bool remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
+ bool newstate)
{
- u8 state = timer->state;
+ lockdep_assert_held(&base->cpu_base->lock);
- if (state & HRTIMER_STATE_ENQUEUED) {
+ if (timer->is_queued) {
bool reprogram;
+ debug_hrtimer_deactivate(timer);
+
/*
* Remove the timer and force reprogramming when high
* resolution mode is active and the timer is on the current
@@ -1154,24 +1197,81 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
* reprogramming happens in the interrupt handler. This is a
* rare case and less expensive than a smp call.
*/
- debug_deactivate(timer);
reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
- /*
- * If the timer is not restarted then reprogramming is
- * required if the timer is local. If it is local and about
- * to be restarted, avoid programming it twice (on removal
- * and a moment later when it's requeued).
- */
- if (!restart)
- state = HRTIMER_STATE_INACTIVE;
- else
- reprogram &= !keep_local;
+ __remove_hrtimer(timer, base, newstate, reprogram);
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Update in place has to retrieve the expiry times of the neighbour nodes
+ * if they exist. That is cache line neutral because the dequeue/enqueue
+ * operation is going to need the same cache lines. But there is a big win
+ * when the dequeue/enqueue can be avoided because the RB tree does not
+ * have to be rebalanced twice.
+ */
+static inline bool
+hrtimer_can_update_in_place(struct hrtimer *timer, struct hrtimer_clock_base *base, ktime_t expires)
+{
+ struct timerqueue_linked_node *next = timerqueue_linked_next(&timer->node);
+ struct timerqueue_linked_node *prev = timerqueue_linked_prev(&timer->node);
+
+ /* If the new expiry goes behind the next timer, requeue is required */
+ if (next && expires > next->expires)
+ return false;
+
+ /* If this is the first timer, update in place */
+ if (!prev)
+ return true;
+
+ /* Update in place when it does not go ahead of the previous one */
+ return expires >= prev->expires;
+}
+
+static inline bool
+remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
+ const enum hrtimer_mode mode, ktime_t expires, u64 delta_ns)
+{
+ bool was_first = false;
+
+ /* Remove it from the timer queue if active */
+ if (timer->is_queued) {
+ was_first = !timerqueue_linked_prev(&timer->node);
+
+ /* Try to update in place to avoid the de/enqueue dance */
+ if (hrtimer_can_update_in_place(timer, base, expires)) {
+ hrtimer_set_expires_range_ns(timer, expires, delta_ns);
+ trace_hrtimer_start(timer, mode, true);
+ if (was_first)
+ base->expires_next = expires;
+ return was_first;
+ }
- __remove_hrtimer(timer, base, state, reprogram);
- return 1;
+ debug_hrtimer_deactivate(timer);
+ timerqueue_linked_del(&base->active, &timer->node);
}
- return 0;
+
+ /* Set the new expiry time */
+ hrtimer_set_expires_range_ns(timer, expires, delta_ns);
+
+ debug_activate(timer, mode, timer->is_queued);
+ base->cpu_base->active_bases |= 1 << base->index;
+
+ /* Pairs with the lockless read in hrtimer_is_queued() */
+ WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED);
+
+ /* If it's the first expiring timer now or again, update base */
+ if (timerqueue_linked_add(&base->active, &timer->node)) {
+ base->expires_next = expires;
+ return true;
+ }
+
+ if (was_first)
+ base_update_next_timer(base);
+
+ return false;
}
static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
@@ -1190,55 +1290,93 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
return tim;
}
-static void
-hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
+static void hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
{
- ktime_t expires;
-
- /*
- * Find the next SOFT expiration.
- */
- expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
+ ktime_t expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
/*
- * reprogramming needs to be triggered, even if the next soft
- * hrtimer expires at the same time than the next hard
+ * Reprogramming needs to be triggered, even if the next soft
+ * hrtimer expires at the same time as the next hard
* hrtimer. cpu_base->softirq_expires_next needs to be updated!
*/
if (expires == KTIME_MAX)
return;
/*
- * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event()
- * cpu_base->*expires_next is only set by hrtimer_reprogram()
+ * cpu_base->next_timer is recomputed by __hrtimer_get_next_event()
+ * cpu_base->expires_next is only set by hrtimer_reprogram()
*/
hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
}
-static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
- u64 delta_ns, const enum hrtimer_mode mode,
- struct hrtimer_clock_base *base)
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned)
+{
+ if (static_branch_likely(&timers_migration_enabled)) {
+ /*
+ * If it is local and the first expiring timer keep it on the local
+ * CPU to optimize reprogramming of the clockevent device. Also
+ * avoid switch_hrtimer_base() overhead when local and pinned.
+ */
+ if (!is_local)
+ return false;
+ if (is_first || is_pinned)
+ return true;
+
+ /* Honour the NOHZ full restrictions */
+ if (!housekeeping_cpu(smp_processor_id(), HK_TYPE_KERNEL_NOISE))
+ return false;
+
+ /*
+ * If the tick is not stopped or need_resched() is set, then
+ * there is no point in moving the timer somewhere else.
+ */
+ return !tick_nohz_tick_stopped() || need_resched();
+ }
+ return is_local;
+}
+#else
+static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned)
+{
+ return is_local;
+}
+#endif
+
+static inline bool hrtimer_keep_base(struct hrtimer *timer, bool is_local, bool is_first,
+ bool is_pinned)
+{
+ /* If the timer is running the callback it has to stay on its CPU base. */
+ if (unlikely(timer->base->running == timer))
+ return true;
+
+ return hrtimer_prefer_local(is_local, is_first, is_pinned);
+}
+
+static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
+ const enum hrtimer_mode mode, struct hrtimer_clock_base *base)
{
struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
- struct hrtimer_clock_base *new_base;
- bool force_local, first;
+ bool is_pinned, first, was_first, keep_base = false;
+ struct hrtimer_cpu_base *cpu_base = base->cpu_base;
- /*
- * If the timer is on the local cpu base and is the first expiring
- * timer then this might end up reprogramming the hardware twice
- * (on removal and on enqueue). To avoid that by prevent the
- * reprogram on removal, keep the timer local to the current CPU
- * and enforce reprogramming after it is queued no matter whether
- * it is the new first expiring timer again or not.
- */
- force_local = base->cpu_base == this_cpu_base;
- force_local &= base->cpu_base->next_timer == timer;
+ was_first = cpu_base->next_timer == timer;
+ is_pinned = !!(mode & HRTIMER_MODE_PINNED);
/*
- * Don't force local queuing if this enqueue happens on a unplugged
- * CPU after hrtimer_cpu_dying() has been invoked.
+ * Don't keep it local if this enqueue happens on an unplugged CPU
+ * after hrtimer_cpu_dying() has been invoked.
*/
- force_local &= this_cpu_base->online;
+ if (likely(this_cpu_base->online)) {
+ bool is_local = cpu_base == this_cpu_base;
+
+ keep_base = hrtimer_keep_base(timer, is_local, was_first, is_pinned);
+ }
+
+ /* Calculate absolute expiry time for relative timers */
+ if (mode & HRTIMER_MODE_REL)
+ tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid));
+ /* Compensate for low resolution granularity */
+ tim = hrtimer_update_lowres(timer, tim, mode);
/*
* Remove an active timer from the queue. In case it is not queued
@@ -1250,32 +1388,41 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* reprogramming later if it was the first expiring timer. This
* avoids programming the underlying clock event twice (once at
* removal and once after enqueue).
+ *
+ * @keep_base is also true if the timer callback is running on a
+ * remote CPU and for local pinned timers.
*/
- remove_hrtimer(timer, base, true, force_local);
+ if (likely(keep_base)) {
+ first = remove_and_enqueue_same_base(timer, base, mode, tim, delta_ns);
+ } else {
+ /* Keep the ENQUEUED state in case it is queued */
+ bool was_armed = remove_hrtimer(timer, base, HRTIMER_STATE_ENQUEUED);
- if (mode & HRTIMER_MODE_REL)
- tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid));
+ hrtimer_set_expires_range_ns(timer, tim, delta_ns);
- tim = hrtimer_update_lowres(timer, tim, mode);
+ /* Switch the timer base, if necessary: */
+ base = switch_hrtimer_base(timer, base, is_pinned);
+ cpu_base = base->cpu_base;
- hrtimer_set_expires_range_ns(timer, tim, delta_ns);
+ first = enqueue_hrtimer(timer, base, mode, was_armed);
+ }
- /* Switch the timer base, if necessary: */
- if (!force_local) {
- new_base = switch_hrtimer_base(timer, base,
- mode & HRTIMER_MODE_PINNED);
- } else {
- new_base = base;
+ /* If a deferred rearm is pending skip reprogramming the device */
+ if (cpu_base->deferred_rearm) {
+ cpu_base->deferred_needs_update = true;
+ return false;
}
- first = enqueue_hrtimer(timer, new_base, mode);
- if (!force_local) {
+ if (!was_first || cpu_base != this_cpu_base) {
/*
- * If the current CPU base is online, then the timer is
- * never queued on a remote CPU if it would be the first
- * expiring timer there.
+ * If the current CPU base is online, then the timer is never
+ * queued on a remote CPU if it would be the first expiring
+ * timer there unless the timer callback is currently executed
+ * on the remote CPU. In the latter case the remote CPU will
+ * re-evaluate the first expiring timer after completing the
+ * callbacks.
*/
- if (hrtimer_base_is_online(this_cpu_base))
+ if (likely(hrtimer_base_is_online(this_cpu_base)))
return first;
/*
@@ -1283,21 +1430,33 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* already offline. If the timer is the first to expire,
* kick the remote CPU to reprogram the clock event.
*/
- if (first) {
- struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base;
+ if (first)
+ smp_call_function_single_async(cpu_base->cpu, &cpu_base->csd);
+ return false;
+ }
- smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd);
- }
- return 0;
+ /*
+ * Special case for the HRTICK timer. It is frequently rearmed and most
+ * of the time moves the expiry into the future. That's expensive in
+ * virtual machines and it's better to take the pointless already armed
+ * interrupt than reprogramming the hardware on every context switch.
+ *
+ * If the new expiry is before the armed time, then reprogramming is
+ * required.
+ */
+ if (timer->is_lazy) {
+ if (cpu_base->expires_next <= hrtimer_get_expires(timer))
+ return false;
}
/*
- * Timer was forced to stay on the current CPU to avoid
- * reprogramming on removal and enqueue. Force reprogram the
- * hardware by evaluating the new first expiring timer.
+ * Timer was the first expiring timer and forced to stay on the
+ * current CPU to avoid reprogramming on removal and enqueue. Force
+ * reprogram the hardware by evaluating the new first expiring
+ * timer.
*/
- hrtimer_force_reprogram(new_base->cpu_base, 1);
- return 0;
+ hrtimer_force_reprogram(cpu_base, /* skip_equal */ true);
+ return false;
}
/**
@@ -1309,12 +1468,14 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
* softirq based mode is considered for debug purpose only!
*/
-void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
- u64 delta_ns, const enum hrtimer_mode mode)
+void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
+ const enum hrtimer_mode mode)
{
struct hrtimer_clock_base *base;
unsigned long flags;
+ debug_hrtimer_assert_init(timer);
+
/*
* Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
* match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard
@@ -1362,8 +1523,11 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
base = lock_hrtimer_base(timer, &flags);
- if (!hrtimer_callback_running(timer))
- ret = remove_hrtimer(timer, base, false, false);
+ if (!hrtimer_callback_running(timer)) {
+ ret = remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE);
+ if (ret)
+ trace_hrtimer_cancel(timer);
+ }
unlock_hrtimer_base(timer, &flags);
@@ -1397,8 +1561,7 @@ static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
* the timer callback to finish. Drop expiry_lock and reacquire it. That
* allows the waiter to acquire the lock and make progress.
*/
-static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base,
- unsigned long flags)
+static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, unsigned long flags)
{
if (atomic_read(&cpu_base->timer_waiters)) {
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
@@ -1463,14 +1626,10 @@ void hrtimer_cancel_wait_running(const struct hrtimer *timer)
spin_unlock_bh(&base->cpu_base->softirq_expiry_lock);
}
#else
-static inline void
-hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { }
-static inline void
-hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { }
-static inline void
-hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { }
-static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base,
- unsigned long flags) { }
+static inline void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { }
+static inline void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { }
+static inline void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { }
+static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, unsigned long fl) { }
#endif
/**
@@ -1526,15 +1685,11 @@ u64 hrtimer_get_next_event(void)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
u64 expires = KTIME_MAX;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&cpu_base->lock, flags);
+ guard(raw_spinlock_irqsave)(&cpu_base->lock);
if (!hrtimer_hres_active(cpu_base))
expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
- raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
-
return expires;
}
@@ -1549,26 +1704,20 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
u64 expires = KTIME_MAX;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&cpu_base->lock, flags);
-
- if (hrtimer_hres_active(cpu_base)) {
- unsigned int active;
+ unsigned int active;
- if (!cpu_base->softirq_activated) {
- active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
- expires = __hrtimer_next_event_base(cpu_base, exclude,
- active, KTIME_MAX);
- }
- active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
- expires = __hrtimer_next_event_base(cpu_base, exclude, active,
- expires);
- }
+ guard(raw_spinlock_irqsave)(&cpu_base->lock);
+ if (!hrtimer_hres_active(cpu_base))
+ return expires;
- raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+ active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
+ if (active && !cpu_base->softirq_activated)
+ expires = hrtimer_bases_next_event_without(cpu_base, exclude, active, KTIME_MAX);
- return expires;
+ active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
+ if (!active)
+ return expires;
+ return hrtimer_bases_next_event_without(cpu_base, exclude, active, expires);
}
#endif
@@ -1612,8 +1761,7 @@ ktime_t hrtimer_cb_get_time(const struct hrtimer *timer)
}
EXPORT_SYMBOL_GPL(hrtimer_cb_get_time);
-static void __hrtimer_setup(struct hrtimer *timer,
- enum hrtimer_restart (*function)(struct hrtimer *),
+static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*fn)(struct hrtimer *),
clockid_t clock_id, enum hrtimer_mode mode)
{
bool softtimer = !!(mode & HRTIMER_MODE_SOFT);
@@ -1645,13 +1793,14 @@ static void __hrtimer_setup(struct hrtimer *timer,
base += hrtimer_clockid_to_base(clock_id);
timer->is_soft = softtimer;
timer->is_hard = !!(mode & HRTIMER_MODE_HARD);
+ timer->is_lazy = !!(mode & HRTIMER_MODE_LAZY_REARM);
timer->base = &cpu_base->clock_base[base];
- timerqueue_init(&timer->node);
+ timerqueue_linked_init(&timer->node);
- if (WARN_ON_ONCE(!function))
+ if (WARN_ON_ONCE(!fn))
ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout;
else
- ACCESS_PRIVATE(timer, function) = function;
+ ACCESS_PRIVATE(timer, function) = fn;
}
/**
@@ -1710,12 +1859,10 @@ bool hrtimer_active(const struct hrtimer *timer)
base = READ_ONCE(timer->base);
seq = raw_read_seqcount_begin(&base->seq);
- if (timer->state != HRTIMER_STATE_INACTIVE ||
- base->running == timer)
+ if (timer->is_queued || base->running == timer)
return true;
- } while (read_seqcount_retry(&base->seq, seq) ||
- base != READ_ONCE(timer->base));
+ } while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base));
return false;
}
@@ -1729,7 +1876,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active);
* - callback: the timer is being ran
* - post: the timer is inactive or (re)queued
*
- * On the read side we ensure we observe timer->state and cpu_base->running
+ * On the read side we ensure we observe timer->is_queued and cpu_base->running
* from the same section, if anything changed while we looked at it, we retry.
* This includes timer->base changing because sequence numbers alone are
* insufficient for that.
@@ -1738,11 +1885,9 @@ EXPORT_SYMBOL_GPL(hrtimer_active);
* a false negative if the read side got smeared over multiple consecutive
* __run_hrtimer() invocations.
*/
-
-static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
- struct hrtimer_clock_base *base,
- struct hrtimer *timer, ktime_t *now,
- unsigned long flags) __must_hold(&cpu_base->lock)
+static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base,
+ struct hrtimer *timer, ktime_t now, unsigned long flags)
+ __must_hold(&cpu_base->lock)
{
enum hrtimer_restart (*fn)(struct hrtimer *);
bool expires_in_hardirq;
@@ -1754,15 +1899,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
base->running = timer;
/*
- * Separate the ->running assignment from the ->state assignment.
+ * Separate the ->running assignment from the ->is_queued assignment.
*
* As with a regular write barrier, this ensures the read side in
* hrtimer_active() cannot observe base->running == NULL &&
- * timer->state == INACTIVE.
+ * timer->is_queued == INACTIVE.
*/
raw_write_seqcount_barrier(&base->seq);
- __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
+ __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, false);
fn = ACCESS_PRIVATE(timer, function);
/*
@@ -1797,16 +1942,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
* hrtimer_start_range_ns() can have popped in and enqueued the timer
* for us already.
*/
- if (restart != HRTIMER_NORESTART &&
- !(timer->state & HRTIMER_STATE_ENQUEUED))
- enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
+ if (restart == HRTIMER_RESTART && !timer->is_queued)
+ enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS, false);
/*
- * Separate the ->running assignment from the ->state assignment.
+ * Separate the ->running assignment from the ->is_queued assignment.
*
* As with a regular write barrier, this ensures the read side in
* hrtimer_active() cannot observe base->running.timer == NULL &&
- * timer->state == INACTIVE.
+ * timer->is_queued == INACTIVE.
*/
raw_write_seqcount_barrier(&base->seq);
@@ -1814,23 +1958,24 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
base->running = NULL;
}
+static __always_inline struct hrtimer *clock_base_next_timer_safe(struct hrtimer_clock_base *base)
+{
+ struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active);
+
+ return next ? hrtimer_from_timerqueue_node(next) : NULL;
+}
+
static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
unsigned long flags, unsigned int active_mask)
{
- struct hrtimer_clock_base *base;
unsigned int active = cpu_base->active_bases & active_mask;
+ struct hrtimer_clock_base *base;
for_each_active_base(base, cpu_base, active) {
- struct timerqueue_node *node;
- ktime_t basenow;
-
- basenow = ktime_add(now, base->offset);
-
- while ((node = timerqueue_getnext(&base->active))) {
- struct hrtimer *timer;
-
- timer = container_of(node, struct hrtimer, node);
+ ktime_t basenow = ktime_add(now, base->offset);
+ struct hrtimer *timer;
+ while ((timer = clock_base_next_timer(base))) {
/*
* The immediate goal for using the softexpires is
* minimizing wakeups, not running timers at the
@@ -1846,7 +1991,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
if (basenow < hrtimer_get_softexpires(timer))
break;
- __run_hrtimer(cpu_base, base, timer, &basenow, flags);
+ __run_hrtimer(cpu_base, base, timer, basenow, flags);
if (active_mask == HRTIMER_ACTIVE_SOFT)
hrtimer_sync_wait_running(cpu_base, flags);
}
@@ -1865,7 +2010,7 @@ static __latent_entropy void hrtimer_run_softirq(void)
now = hrtimer_update_base(cpu_base);
__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);
- cpu_base->softirq_activated = 0;
+ cpu_base->softirq_activated = false;
hrtimer_update_softirq_timer(cpu_base, true);
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
@@ -1875,6 +2020,63 @@ static __latent_entropy void hrtimer_run_softirq(void)
#ifdef CONFIG_HIGH_RES_TIMERS
/*
+ * Very similar to hrtimer_force_reprogram(), except it deals with
+ * deferred_rearm and hang_detected.
+ */
+static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next, bool deferred)
+{
+ cpu_base->expires_next = expires_next;
+ cpu_base->deferred_rearm = false;
+
+ if (unlikely(cpu_base->hang_detected)) {
+ /*
+ * Give the system a chance to do something else than looping
+ * on hrtimer interrupts.
+ */
+ expires_next = ktime_add_ns(ktime_get(),
+ min(100 * NSEC_PER_MSEC, cpu_base->max_hang_time));
+ }
+ hrtimer_rearm_event(expires_next, deferred);
+}
+
+#ifdef CONFIG_HRTIMER_REARM_DEFERRED
+void __hrtimer_rearm_deferred(void)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ ktime_t expires_next;
+
+ if (!cpu_base->deferred_rearm)
+ return;
+
+ guard(raw_spinlock)(&cpu_base->lock);
+ if (cpu_base->deferred_needs_update) {
+ hrtimer_update_base(cpu_base);
+ expires_next = hrtimer_update_next_event(cpu_base);
+ } else {
+ /* No timer added/removed. Use the cached value */
+ expires_next = cpu_base->deferred_expires_next;
+ }
+ hrtimer_rearm(cpu_base, expires_next, true);
+}
+
+static __always_inline void
+hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next)
+{
+ /* hrtimer_interrupt() just re-evaluated the first expiring timer */
+ cpu_base->deferred_needs_update = false;
+ /* Cache the expiry time */
+ cpu_base->deferred_expires_next = expires_next;
+ set_thread_flag(TIF_HRTIMER_REARM);
+}
+#else /* CONFIG_HRTIMER_REARM_DEFERRED */
+static __always_inline void
+hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next)
+{
+ hrtimer_rearm(cpu_base, expires_next, false);
+}
+#endif /* !CONFIG_HRTIMER_REARM_DEFERRED */
+
+/*
* High resolution timer interrupt
* Called with interrupts disabled
*/
@@ -1888,86 +2090,55 @@ void hrtimer_interrupt(struct clock_event_device *dev)
BUG_ON(!cpu_base->hres_active);
cpu_base->nr_events++;
dev->next_event = KTIME_MAX;
+ dev->next_event_forced = 0;
raw_spin_lock_irqsave(&cpu_base->lock, flags);
entry_time = now = hrtimer_update_base(cpu_base);
retry:
- cpu_base->in_hrtirq = 1;
+ cpu_base->deferred_rearm = true;
/*
- * We set expires_next to KTIME_MAX here with cpu_base->lock
- * held to prevent that a timer is enqueued in our queue via
- * the migration code. This does not affect enqueueing of
- * timers which run their callback and need to be requeued on
- * this CPU.
+ * Set expires_next to KTIME_MAX, which prevents that remote CPUs queue
+ * timers while __hrtimer_run_queues() is expiring the clock bases.
+ * Timers which are re/enqueued on the local CPU are not affected by
+ * this.
*/
cpu_base->expires_next = KTIME_MAX;
if (!ktime_before(now, cpu_base->softirq_expires_next)) {
cpu_base->softirq_expires_next = KTIME_MAX;
- cpu_base->softirq_activated = 1;
+ cpu_base->softirq_activated = true;
raise_timer_softirq(HRTIMER_SOFTIRQ);
}
__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
- /* Reevaluate the clock bases for the [soft] next expiry */
- expires_next = hrtimer_update_next_event(cpu_base);
- /*
- * Store the new expiry value so the migration code can verify
- * against it.
- */
- cpu_base->expires_next = expires_next;
- cpu_base->in_hrtirq = 0;
- raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
-
- /* Reprogramming necessary ? */
- if (!tick_program_event(expires_next, 0)) {
- cpu_base->hang_detected = 0;
- return;
- }
-
/*
* The next timer was already expired due to:
* - tracing
* - long lasting callbacks
* - being scheduled away when running in a VM
*
- * We need to prevent that we loop forever in the hrtimer
- * interrupt routine. We give it 3 attempts to avoid
- * overreacting on some spurious event.
- *
- * Acquire base lock for updating the offsets and retrieving
- * the current time.
+ * We need to prevent that we loop forever in the hrtimer interrupt
+ * routine. We give it 3 attempts to avoid overreacting on some
+ * spurious event.
*/
- raw_spin_lock_irqsave(&cpu_base->lock, flags);
now = hrtimer_update_base(cpu_base);
- cpu_base->nr_retries++;
- if (++retries < 3)
- goto retry;
- /*
- * Give the system a chance to do something else than looping
- * here. We stored the entry time, so we know exactly how long
- * we spent here. We schedule the next event this amount of
- * time away.
- */
- cpu_base->nr_hangs++;
- cpu_base->hang_detected = 1;
- raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+ expires_next = hrtimer_update_next_event(cpu_base);
+ cpu_base->hang_detected = false;
+ if (expires_next < now) {
+ if (++retries < 3)
+ goto retry;
+
+ delta = ktime_sub(now, entry_time);
+ cpu_base->max_hang_time = max_t(unsigned int, cpu_base->max_hang_time, delta);
+ cpu_base->nr_hangs++;
+ cpu_base->hang_detected = true;
+ }
- delta = ktime_sub(now, entry_time);
- if ((unsigned int)delta > cpu_base->max_hang_time)
- cpu_base->max_hang_time = (unsigned int) delta;
- /*
- * Limit it to a sensible value as we enforce a longer
- * delay. Give the CPU at least 100ms to catch up.
- */
- if (delta > 100 * NSEC_PER_MSEC)
- expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
- else
- expires_next = ktime_add(now, delta);
- tick_program_event(expires_next, 1);
- pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
+ hrtimer_interrupt_rearm(cpu_base, expires_next);
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
}
+
#endif /* !CONFIG_HIGH_RES_TIMERS */
/*
@@ -1999,7 +2170,7 @@ void hrtimer_run_queues(void)
if (!ktime_before(now, cpu_base->softirq_expires_next)) {
cpu_base->softirq_expires_next = KTIME_MAX;
- cpu_base->softirq_activated = 1;
+ cpu_base->softirq_activated = true;
raise_timer_softirq(HRTIMER_SOFTIRQ);
}
@@ -2012,8 +2183,7 @@ void hrtimer_run_queues(void)
*/
static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
{
- struct hrtimer_sleeper *t =
- container_of(timer, struct hrtimer_sleeper, timer);
+ struct hrtimer_sleeper *t = container_of(timer, struct hrtimer_sleeper, timer);
struct task_struct *task = t->task;
t->task = NULL;
@@ -2031,8 +2201,7 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
* Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers
* to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context)
*/
-void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
- enum hrtimer_mode mode)
+void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode)
{
/*
* Make the enqueue delivery mode check work on RT. If the sleeper
@@ -2048,8 +2217,8 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
}
EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);
-static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl,
- clockid_t clock_id, enum hrtimer_mode mode)
+static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
+ enum hrtimer_mode mode)
{
/*
* On PREEMPT_RT enabled kernels hrtimers which are not explicitly
@@ -2085,8 +2254,8 @@ static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl,
* @clock_id: the clock to be used
* @mode: timer mode abs/rel
*/
-void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl,
- clockid_t clock_id, enum hrtimer_mode mode)
+void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id,
+ enum hrtimer_mode mode)
{
debug_setup_on_stack(&sl->timer, clock_id, mode);
__hrtimer_setup_sleeper(sl, clock_id, mode);
@@ -2159,12 +2328,11 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
return ret;
}
-long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
- const clockid_t clockid)
+long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid)
{
struct restart_block *restart;
struct hrtimer_sleeper t;
- int ret = 0;
+ int ret;
hrtimer_setup_sleeper_on_stack(&t, clockid, mode);
hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns);
@@ -2203,8 +2371,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
- return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
- CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
#endif
@@ -2212,7 +2379,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
#ifdef CONFIG_COMPAT_32BIT_TIME
SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
- struct old_timespec32 __user *, rmtp)
+ struct old_timespec32 __user *, rmtp)
{
struct timespec64 tu;
@@ -2225,8 +2392,7 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
- return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
- CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
#endif
@@ -2236,14 +2402,13 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
int hrtimers_prepare_cpu(unsigned int cpu)
{
struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
- int i;
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
+ for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i];
clock_b->cpu_base = cpu_base;
seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock);
- timerqueue_init_head(&clock_b->active);
+ timerqueue_linked_init_head(&clock_b->active);
}
cpu_base->cpu = cpu;
@@ -2257,13 +2422,14 @@ int hrtimers_cpu_starting(unsigned int cpu)
/* Clear out any left over state from a CPU down operation */
cpu_base->active_bases = 0;
- cpu_base->hres_active = 0;
- cpu_base->hang_detected = 0;
+ cpu_base->hres_active = false;
+ cpu_base->hang_detected = false;
cpu_base->next_timer = NULL;
cpu_base->softirq_next_timer = NULL;
cpu_base->expires_next = KTIME_MAX;
cpu_base->softirq_expires_next = KTIME_MAX;
- cpu_base->online = 1;
+ cpu_base->softirq_activated = false;
+ cpu_base->online = true;
return 0;
}
@@ -2272,20 +2438,20 @@ int hrtimers_cpu_starting(unsigned int cpu)
static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
struct hrtimer_clock_base *new_base)
{
+ struct timerqueue_linked_node *node;
struct hrtimer *timer;
- struct timerqueue_node *node;
- while ((node = timerqueue_getnext(&old_base->active))) {
- timer = container_of(node, struct hrtimer, node);
+ while ((node = timerqueue_linked_first(&old_base->active))) {
+ timer = hrtimer_from_timerqueue_node(node);
BUG_ON(hrtimer_callback_running(timer));
- debug_deactivate(timer);
+ debug_hrtimer_deactivate(timer);
/*
* Mark it as ENQUEUED not INACTIVE otherwise the
* timer could be seen as !active and just vanish away
* under us on another CPU
*/
- __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
+ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, false);
timer->base = new_base;
/*
* Enqueue the timers on the new cpu. This does not
@@ -2295,13 +2461,13 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
* sort out already expired timers and reprogram the
* event device.
*/
- enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
+ enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS, true);
}
}
int hrtimers_cpu_dying(unsigned int dying_cpu)
{
- int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER));
+ int ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER));
struct hrtimer_cpu_base *old_base, *new_base;
old_base = this_cpu_ptr(&hrtimer_bases);
@@ -2314,16 +2480,14 @@ int hrtimers_cpu_dying(unsigned int dying_cpu)
raw_spin_lock(&old_base->lock);
raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
- migrate_hrtimer_list(&old_base->clock_base[i],
- &new_base->clock_base[i]);
- }
+ for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
+ migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]);
/* Tell the other CPU to retrigger the next event */
smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
raw_spin_unlock(&new_base->lock);
- old_base->online = 0;
+ old_base->online = false;
raw_spin_unlock(&old_base->lock);
return 0;
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a5c7d15fce72..1c954f330dfe 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -32,7 +32,6 @@ static u64 jiffies_read(struct clocksource *cs)
static struct clocksource clocksource_jiffies = {
.name = "jiffies",
.rating = 1, /* lowest valid rating*/
- .uncertainty_margin = 32 * NSEC_PER_MSEC,
.read = jiffies_read,
.mask = CLOCKSOURCE_MASK(32),
.mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */
@@ -256,8 +255,6 @@ EXPORT_SYMBOL(proc_dointvec_jiffies);
int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos)
{
- if (SYSCTL_USER_TO_KERN(dir) && USER_HZ < HZ)
- return -EINVAL;
return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
do_proc_int_conv_userhz_jiffies);
}
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 652744e00eb4..4bca3f78c8ea 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -18,8 +18,9 @@
#include <linux/cred.h>
#include <linux/err.h>
#include <linux/mm.h>
+#include <linux/cleanup.h>
-#include <vdso/datapage.h>
+#include "namespace_internal.h"
ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim,
struct timens_offsets *ns_offsets)
@@ -93,8 +94,8 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
if (!ns)
goto fail_dec;
- ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!ns->vvar_page)
+ err = timens_vdso_alloc_vvar_page(ns);
+ if (err)
goto fail_free;
err = ns_common_init(ns);
@@ -109,7 +110,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
return ns;
fail_free_page:
- __free_page(ns->vvar_page);
+ timens_vdso_free_vvar_page(ns);
fail_free:
kfree(ns);
fail_dec:
@@ -138,117 +139,7 @@ struct time_namespace *copy_time_ns(u64 flags,
return clone_time_ns(user_ns, old_ns);
}
-static struct timens_offset offset_from_ts(struct timespec64 off)
-{
- struct timens_offset ret;
-
- ret.sec = off.tv_sec;
- ret.nsec = off.tv_nsec;
-
- return ret;
-}
-
-/*
- * A time namespace VVAR page has the same layout as the VVAR page which
- * contains the system wide VDSO data.
- *
- * For a normal task the VVAR pages are installed in the normal ordering:
- * VVAR
- * PVCLOCK
- * HVCLOCK
- * TIMENS <- Not really required
- *
- * Now for a timens task the pages are installed in the following order:
- * TIMENS
- * PVCLOCK
- * HVCLOCK
- * VVAR
- *
- * The check for vdso_clock->clock_mode is in the unlikely path of
- * the seq begin magic. So for the non-timens case most of the time
- * 'seq' is even, so the branch is not taken.
- *
- * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
- * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the
- * update to finish and for 'seq' to become even anyway.
- *
- * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which
- * enforces the time namespace handling path.
- */
-static void timens_setup_vdso_clock_data(struct vdso_clock *vc,
- struct time_namespace *ns)
-{
- struct timens_offset *offset = vc->offset;
- struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic);
- struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);
-
- vc->seq = 1;
- vc->clock_mode = VDSO_CLOCKMODE_TIMENS;
- offset[CLOCK_MONOTONIC] = monotonic;
- offset[CLOCK_MONOTONIC_RAW] = monotonic;
- offset[CLOCK_MONOTONIC_COARSE] = monotonic;
- offset[CLOCK_BOOTTIME] = boottime;
- offset[CLOCK_BOOTTIME_ALARM] = boottime;
-}
-
-struct page *find_timens_vvar_page(struct vm_area_struct *vma)
-{
- if (likely(vma->vm_mm == current->mm))
- return current->nsproxy->time_ns->vvar_page;
-
- /*
- * VM_PFNMAP | VM_IO protect .fault() handler from being called
- * through interfaces like /proc/$pid/mem or
- * process_vm_{readv,writev}() as long as there's no .access()
- * in special_mapping_vmops().
- * For more details check_vma_flags() and __access_remote_vm()
- */
-
- WARN(1, "vvar_page accessed remotely");
-
- return NULL;
-}
-
-/*
- * Protects possibly multiple offsets writers racing each other
- * and tasks entering the namespace.
- */
-static DEFINE_MUTEX(offset_lock);
-
-static void timens_set_vvar_page(struct task_struct *task,
- struct time_namespace *ns)
-{
- struct vdso_time_data *vdata;
- struct vdso_clock *vc;
- unsigned int i;
-
- if (ns == &init_time_ns)
- return;
-
- /* Fast-path, taken by every task in namespace except the first. */
- if (likely(ns->frozen_offsets))
- return;
-
- mutex_lock(&offset_lock);
- /* Nothing to-do: vvar_page has been already initialized. */
- if (ns->frozen_offsets)
- goto out;
-
- ns->frozen_offsets = true;
- vdata = page_address(ns->vvar_page);
- vc = vdata->clock_data;
-
- for (i = 0; i < CS_BASES; i++)
- timens_setup_vdso_clock_data(&vc[i], ns);
-
- if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) {
- for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++)
- timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns);
- }
-
-out:
- mutex_unlock(&offset_lock);
-}
+DEFINE_MUTEX(timens_offset_lock);
void free_time_ns(struct time_namespace *ns)
{
@@ -256,41 +147,39 @@ void free_time_ns(struct time_namespace *ns)
dec_time_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
ns_common_free(ns);
- __free_page(ns->vvar_page);
+ timens_vdso_free_vvar_page(ns);
/* Concurrent nstree traversal depends on a grace period. */
kfree_rcu(ns, ns.ns_rcu);
}
static struct ns_common *timens_get(struct task_struct *task)
{
- struct time_namespace *ns = NULL;
+ struct time_namespace *ns;
struct nsproxy *nsproxy;
- task_lock(task);
+ guard(task_lock)(task);
nsproxy = task->nsproxy;
- if (nsproxy) {
- ns = nsproxy->time_ns;
- get_time_ns(ns);
- }
- task_unlock(task);
+ if (!nsproxy)
+ return NULL;
- return ns ? &ns->ns : NULL;
+ ns = nsproxy->time_ns;
+ get_time_ns(ns);
+ return &ns->ns;
}
static struct ns_common *timens_for_children_get(struct task_struct *task)
{
- struct time_namespace *ns = NULL;
+ struct time_namespace *ns;
struct nsproxy *nsproxy;
- task_lock(task);
+ guard(task_lock)(task);
nsproxy = task->nsproxy;
- if (nsproxy) {
- ns = nsproxy->time_ns_for_children;
- get_time_ns(ns);
- }
- task_unlock(task);
+ if (!nsproxy)
+ return NULL;
- return ns ? &ns->ns : NULL;
+ ns = nsproxy->time_ns_for_children;
+ get_time_ns(ns);
+ return &ns->ns;
}
static void timens_put(struct ns_common *ns)
@@ -298,12 +187,6 @@ static void timens_put(struct ns_common *ns)
put_time_ns(to_time_ns(ns));
}
-void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
-{
- timens_set_vvar_page(tsk, ns);
- vdso_join_timens(tsk, ns);
-}
-
static int timens_install(struct nsset *nsset, struct ns_common *new)
{
struct nsproxy *nsproxy = nsset->nsproxy;
@@ -367,36 +250,33 @@ static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts)
void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m)
{
- struct ns_common *ns;
- struct time_namespace *time_ns;
+ struct time_namespace *time_ns __free(time_ns) = NULL;
+ struct ns_common *ns = timens_for_children_get(p);
- ns = timens_for_children_get(p);
if (!ns)
return;
+
time_ns = to_time_ns(ns);
show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic);
show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime);
- put_time_ns(time_ns);
}
int proc_timens_set_offset(struct file *file, struct task_struct *p,
struct proc_timens_offset *offsets, int noffsets)
{
- struct ns_common *ns;
- struct time_namespace *time_ns;
+ struct time_namespace *time_ns __free(time_ns) = NULL;
+ struct ns_common *ns = timens_for_children_get(p);
struct timespec64 tp;
- int i, err;
+ int i;
- ns = timens_for_children_get(p);
if (!ns)
return -ESRCH;
+
time_ns = to_time_ns(ns);
- if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) {
- put_time_ns(time_ns);
+ if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME))
return -EPERM;
- }
for (i = 0; i < noffsets; i++) {
struct proc_timens_offset *off = &offsets[i];
@@ -409,15 +289,12 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p,
ktime_get_boottime_ts64(&tp);
break;
default:
- err = -EINVAL;
- goto out;
+ return -EINVAL;
}
- err = -ERANGE;
-
if (off->val.tv_sec > KTIME_SEC_MAX ||
off->val.tv_sec < -KTIME_SEC_MAX)
- goto out;
+ return -ERANGE;
tp = timespec64_add(tp, off->val);
/*
@@ -425,16 +302,13 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p,
* still unreachable.
*/
if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2)
- goto out;
+ return -ERANGE;
}
- mutex_lock(&offset_lock);
- if (time_ns->frozen_offsets) {
- err = -EACCES;
- goto out_unlock;
- }
+ guard(mutex)(&timens_offset_lock);
+ if (time_ns->frozen_offsets)
+ return -EACCES;
- err = 0;
/* Don't report errors after this line */
for (i = 0; i < noffsets; i++) {
struct proc_timens_offset *off = &offsets[i];
@@ -452,12 +326,7 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p,
*offset = off->val;
}
-out_unlock:
- mutex_unlock(&offset_lock);
-out:
- put_time_ns(time_ns);
-
- return err;
+ return 0;
}
const struct proc_ns_operations timens_operations = {
diff --git a/kernel/time/namespace_internal.h b/kernel/time/namespace_internal.h
new file mode 100644
index 000000000000..b37ba179f43b
--- /dev/null
+++ b/kernel/time/namespace_internal.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _TIME_NAMESPACE_INTERNAL_H
+#define _TIME_NAMESPACE_INTERNAL_H
+
+#include <linux/mutex.h>
+
+struct time_namespace;
+
+/*
+ * Protects possibly multiple offsets writers racing each other
+ * and tasks entering the namespace.
+ */
+extern struct mutex timens_offset_lock;
+
+#ifdef CONFIG_TIME_NS_VDSO
+int timens_vdso_alloc_vvar_page(struct time_namespace *ns);
+void timens_vdso_free_vvar_page(struct time_namespace *ns);
+#else /* !CONFIG_TIME_NS_VDSO */
+static inline int timens_vdso_alloc_vvar_page(struct time_namespace *ns)
+{
+ return 0;
+}
+static inline void timens_vdso_free_vvar_page(struct time_namespace *ns)
+{
+}
+#endif /* CONFIG_TIME_NS_VDSO */
+
+#endif /* _TIME_NAMESPACE_INTERNAL_H */
diff --git a/kernel/time/namespace_vdso.c b/kernel/time/namespace_vdso.c
new file mode 100644
index 000000000000..0d74d160eec9
--- /dev/null
+++ b/kernel/time/namespace_vdso.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Author: Andrei Vagin <avagin@openvz.org>
+ * Author: Dmitry Safonov <dima@arista.com>
+ */
+
+#include <linux/cleanup.h>
+#include <linux/mm.h>
+#include <linux/time_namespace.h>
+#include <linux/time.h>
+#include <linux/vdso_datastore.h>
+
+#include <vdso/clocksource.h>
+#include <vdso/datapage.h>
+
+#include "namespace_internal.h"
+
+static struct timens_offset offset_from_ts(struct timespec64 off)
+{
+ struct timens_offset ret;
+
+ ret.sec = off.tv_sec;
+ ret.nsec = off.tv_nsec;
+
+ return ret;
+}
+
+/*
+ * A time namespace VVAR page has the same layout as the VVAR page which
+ * contains the system wide VDSO data.
+ *
+ * For a normal task the VVAR pages are installed in the normal ordering:
+ * VVAR
+ * PVCLOCK
+ * HVCLOCK
+ * TIMENS <- Not really required
+ *
+ * Now for a timens task the pages are installed in the following order:
+ * TIMENS
+ * PVCLOCK
+ * HVCLOCK
+ * VVAR
+ *
+ * The check for vdso_clock->clock_mode is in the unlikely path of
+ * the seq begin magic. So for the non-timens case most of the time
+ * 'seq' is even, so the branch is not taken.
+ *
+ * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
+ * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the
+ * update to finish and for 'seq' to become even anyway.
+ *
+ * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which
+ * enforces the time namespace handling path.
+ */
+static void timens_setup_vdso_clock_data(struct vdso_clock *vc,
+ struct time_namespace *ns)
+{
+ struct timens_offset *offset = vc->offset;
+ struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic);
+ struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);
+
+ vc->seq = 1;
+ vc->clock_mode = VDSO_CLOCKMODE_TIMENS;
+ offset[CLOCK_MONOTONIC] = monotonic;
+ offset[CLOCK_MONOTONIC_RAW] = monotonic;
+ offset[CLOCK_MONOTONIC_COARSE] = monotonic;
+ offset[CLOCK_BOOTTIME] = boottime;
+ offset[CLOCK_BOOTTIME_ALARM] = boottime;
+}
+
+struct page *find_timens_vvar_page(struct vm_area_struct *vma)
+{
+ if (likely(vma->vm_mm == current->mm))
+ return current->nsproxy->time_ns->vvar_page;
+
+ /*
+ * VM_PFNMAP | VM_IO protect .fault() handler from being called
+ * through interfaces like /proc/$pid/mem or
+ * process_vm_{readv,writev}() as long as there's no .access()
+ * in special_mapping_vmops().
+ * For more details check_vma_flags() and __access_remote_vm()
+ */
+
+ WARN(1, "vvar_page accessed remotely");
+
+ return NULL;
+}
+
+static void timens_set_vvar_page(struct task_struct *task,
+ struct time_namespace *ns)
+{
+ struct vdso_time_data *vdata;
+ struct vdso_clock *vc;
+ unsigned int i;
+
+ if (ns == &init_time_ns)
+ return;
+
+ /* Fast-path, taken by every task in namespace except the first. */
+ if (likely(ns->frozen_offsets))
+ return;
+
+ guard(mutex)(&timens_offset_lock);
+	/* Nothing to do: vvar_page has already been initialized. */
+ if (ns->frozen_offsets)
+ return;
+
+ ns->frozen_offsets = true;
+ vdata = page_address(ns->vvar_page);
+ vc = vdata->clock_data;
+
+ for (i = 0; i < CS_BASES; i++)
+ timens_setup_vdso_clock_data(&vc[i], ns);
+
+ if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) {
+ for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++)
+ timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns);
+ }
+}
+
+/*
+ * The vvar page layout depends on whether a task belongs to the root or
+ * non-root time namespace. Whenever a task changes its namespace, the VVAR
+ * page tables are cleared and then they will be re-faulted with a
+ * corresponding layout.
+ * See also the comment near timens_setup_vdso_clock_data() for details.
+ */
+static int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
+{
+ struct mm_struct *mm = task->mm;
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ guard(mmap_read_lock)(mm);
+ for_each_vma(vmi, vma) {
+ if (vma_is_special_mapping(vma, &vdso_vvar_mapping))
+ zap_vma(vma);
+ }
+ return 0;
+}
+
+void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
+{
+ timens_set_vvar_page(tsk, ns);
+ vdso_join_timens(tsk, ns);
+}
+
+int timens_vdso_alloc_vvar_page(struct time_namespace *ns)
+{
+ ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!ns->vvar_page)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void timens_vdso_free_vvar_page(struct time_namespace *ns)
+{
+ __free_page(ns->vvar_page);
+}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 413e2389f0a5..9331e1614124 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1092,7 +1092,7 @@ void exit_itimers(struct task_struct *tsk)
}
/*
- * There should be no timers on the ignored list. itimer_delete() has
+ * There should be no timers on the ignored list. posix_timer_delete() has
* mopped them up.
*/
if (!WARN_ON_ONCE(!hlist_empty(&tsk->signal->ignored_posix_timers)))
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index a88b72b0f35e..51f6a1032c83 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -78,7 +78,6 @@ static struct clock_event_device ce_broadcast_hrtimer = {
.set_state_shutdown = bc_shutdown,
.set_next_ktime = bc_set_next,
.features = CLOCK_EVT_FEAT_ONESHOT |
- CLOCK_EVT_FEAT_KTIME |
CLOCK_EVT_FEAT_HRTIMER,
.rating = 0,
.bound_on = -1,
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f63c65881364..7e57fa31ee26 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -76,8 +76,10 @@ const struct clock_event_device *tick_get_wakeup_device(int cpu)
*/
static void tick_broadcast_start_periodic(struct clock_event_device *bc)
{
- if (bc)
+ if (bc) {
+ bc->next_event_forced = 0;
tick_setup_periodic(bc, 1);
+ }
}
/*
@@ -403,6 +405,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
bool bc_local;
raw_spin_lock(&tick_broadcast_lock);
+ tick_broadcast_device.evtdev->next_event_forced = 0;
/* Handle spurious interrupts gracefully */
if (clockevent_state_shutdown(tick_broadcast_device.evtdev)) {
@@ -696,6 +699,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
raw_spin_lock(&tick_broadcast_lock);
dev->next_event = KTIME_MAX;
+ tick_broadcast_device.evtdev->next_event_forced = 0;
next_event = KTIME_MAX;
cpumask_clear(tmpmask);
now = ktime_get();
@@ -1063,6 +1067,7 @@ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc,
bc->event_handler = tick_handle_oneshot_broadcast;
+ bc->next_event_forced = 0;
bc->next_event = KTIME_MAX;
/*
@@ -1175,6 +1180,7 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu)
}
/* This moves the broadcast assignment to this CPU: */
+ bc->next_event_forced = 0;
clockevents_program_event(bc, bc->next_event, 1);
}
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index d305d8521896..6a9198a4279b 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -110,6 +110,7 @@ void tick_handle_periodic(struct clock_event_device *dev)
int cpu = smp_processor_id();
ktime_t next = dev->next_event;
+ dev->next_event_forced = 0;
tick_periodic(cpu);
/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f7907fadd63f..cbbb87a0c6e7 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -345,7 +345,7 @@ static bool check_tick_dependency(atomic_t *dep)
int val = atomic_read(dep);
if (likely(!tracepoint_enabled(tick_stop)))
- return !val;
+ return !!val;
if (val & TICK_DEP_MASK_POSIX_TIMER) {
trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER);
@@ -864,19 +864,32 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
+/* Simplified variant of hrtimer_forward_now() */
+static ktime_t tick_forward_now(ktime_t expires, ktime_t now)
+{
+ ktime_t delta = now - expires;
+
+ if (likely(delta < TICK_NSEC))
+ return expires + TICK_NSEC;
+
+ expires += TICK_NSEC * ktime_divns(delta, TICK_NSEC);
+ if (expires > now)
+ return expires;
+ return expires + TICK_NSEC;
+}
+
static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
{
- hrtimer_cancel(&ts->sched_timer);
- hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
+ ktime_t expires = ts->last_tick;
- /* Forward the time to expire in the future */
- hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
+ if (now >= expires)
+ expires = tick_forward_now(expires, now);
if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) {
- hrtimer_start_expires(&ts->sched_timer,
- HRTIMER_MODE_ABS_PINNED_HARD);
+ hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED_HARD);
} else {
- tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+ hrtimer_set_expires(&ts->sched_timer, expires);
+ tick_program_event(expires, 1);
}
/*
@@ -1513,6 +1526,7 @@ static void tick_nohz_lowres_handler(struct clock_event_device *dev)
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
dev->next_event = KTIME_MAX;
+ dev->next_event_forced = 0;
if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART))
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 0ba8e3c50d62..0d832317d576 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -365,20 +365,16 @@ SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp)
}
#endif
+#if HZ > MSEC_PER_SEC || (MSEC_PER_SEC % HZ)
/**
* jiffies_to_msecs - Convert jiffies to milliseconds
* @j: jiffies value
*
- * Avoid unnecessary multiplications/divisions in the
- * two most common HZ cases.
- *
* Return: milliseconds value
*/
unsigned int jiffies_to_msecs(const unsigned long j)
{
-#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
- return (MSEC_PER_SEC / HZ) * j;
-#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
+#if HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
#else
# if BITS_PER_LONG == 32
@@ -390,7 +386,9 @@ unsigned int jiffies_to_msecs(const unsigned long j)
#endif
}
EXPORT_SYMBOL(jiffies_to_msecs);
+#endif
+#if (USEC_PER_SEC % HZ)
/**
* jiffies_to_usecs - Convert jiffies to microseconds
* @j: jiffies value
@@ -405,17 +403,14 @@ unsigned int jiffies_to_usecs(const unsigned long j)
*/
BUILD_BUG_ON(HZ > USEC_PER_SEC);
-#if !(USEC_PER_SEC % HZ)
- return (USEC_PER_SEC / HZ) * j;
-#else
-# if BITS_PER_LONG == 32
+#if BITS_PER_LONG == 32
return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
-# else
+#else
return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
-# endif
#endif
}
EXPORT_SYMBOL(jiffies_to_usecs);
+#endif
/**
* mktime64 - Converts date to seconds.
@@ -702,7 +697,7 @@ EXPORT_SYMBOL(clock_t_to_jiffies);
*
* Return: jiffies_64 value converted to 64-bit "clock_t" (CLOCKS_PER_SEC)
*/
-u64 jiffies_64_to_clock_t(u64 x)
+notrace u64 jiffies_64_to_clock_t(u64 x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
# if HZ < USER_HZ
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 91fa2003351c..c493a4010305 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -3,34 +3,30 @@
* Kernel timekeeping code and accessor functions. Based on code from
* timer.c, moved in commit 8524070b7982.
*/
-#include <linux/timekeeper_internal.h>
-#include <linux/module.h>
-#include <linux/interrupt.h>
+#include <linux/audit.h>
+#include <linux/clocksource.h>
+#include <linux/compiler.h>
+#include <linux/jiffies.h>
#include <linux/kobject.h>
-#include <linux/percpu.h>
-#include <linux/init.h>
-#include <linux/mm.h>
+#include <linux/module.h>
#include <linux/nmi.h>
-#include <linux/sched.h>
-#include <linux/sched/loadavg.h>
+#include <linux/pvclock_gtod.h>
+#include <linux/random.h>
#include <linux/sched/clock.h>
+#include <linux/sched/loadavg.h>
+#include <linux/static_key.h>
+#include <linux/stop_machine.h>
#include <linux/syscore_ops.h>
-#include <linux/clocksource.h>
-#include <linux/jiffies.h>
+#include <linux/tick.h>
#include <linux/time.h>
#include <linux/timex.h>
-#include <linux/tick.h>
-#include <linux/stop_machine.h>
-#include <linux/pvclock_gtod.h>
-#include <linux/compiler.h>
-#include <linux/audit.h>
-#include <linux/random.h>
+#include <linux/timekeeper_internal.h>
#include <vdso/auxclock.h>
#include "tick-internal.h"
-#include "ntp_internal.h"
#include "timekeeping_internal.h"
+#include "ntp_internal.h"
#define TK_CLEAR_NTP (1 << 0)
#define TK_CLOCK_WAS_SET (1 << 1)
@@ -275,6 +271,11 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot);
}
+#ifdef CONFIG_ARCH_WANTS_CLOCKSOURCE_READ_INLINE
+#include <asm/clock_inlined.h>
+
+static DEFINE_STATIC_KEY_FALSE(clocksource_read_inlined);
+
/*
* tk_clock_read - atomic clocksource read() helper
*
@@ -288,12 +289,35 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
* a read of the fast-timekeeper tkrs (which is protected by its own locking
* and update logic).
*/
-static inline u64 tk_clock_read(const struct tk_read_base *tkr)
+static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr)
+{
+ struct clocksource *clock = READ_ONCE(tkr->clock);
+
+ if (static_branch_likely(&clocksource_read_inlined))
+ return arch_inlined_clocksource_read(clock);
+
+ return clock->read(clock);
+}
+
+static inline void clocksource_disable_inline_read(void)
+{
+ static_branch_disable(&clocksource_read_inlined);
+}
+
+static inline void clocksource_enable_inline_read(void)
+{
+ static_branch_enable(&clocksource_read_inlined);
+}
+#else
+static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr)
{
struct clocksource *clock = READ_ONCE(tkr->clock);
return clock->read(clock);
}
+static inline void clocksource_disable_inline_read(void) { }
+static inline void clocksource_enable_inline_read(void) { }
+#endif
/**
* tk_setup_internals - Set up internals to use clocksource clock.
@@ -367,6 +391,27 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
tk->tkr_raw.mult = clock->mult;
tk->ntp_err_mult = 0;
tk->skip_second_overflow = 0;
+
+ tk->cs_id = clock->id;
+
+ /* Coupled clockevent data */
+ if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) &&
+ clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT) {
+ /*
+ * Aim for an one hour maximum delta and use KHz to handle
+ * clocksources with a frequency above 4GHz correctly as
+ * the frequency argument of clocks_calc_mult_shift() is u32.
+ */
+ clocks_calc_mult_shift(&tk->cs_ns_to_cyc_mult, &tk->cs_ns_to_cyc_shift,
+ NSEC_PER_MSEC, clock->freq_khz, 3600 * 1000);
+ /*
+ * Initialize the conversion limit as the previous clocksource
+ * might have the same shift/mult pair so the quick check in
+ * tk_update_ns_to_cyc() fails to update it after a clocksource
+		 * change leaving it effectively zero.
+ */
+ tk->cs_ns_to_cyc_maxns = div_u64(clock->mask, tk->cs_ns_to_cyc_mult);
+ }
}
/* Timekeeper helper functions. */
@@ -375,7 +420,7 @@ static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta)
return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift);
}
-static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
+static __always_inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
{
/* Calculate the delta since the last update_wall_time() */
u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask;
@@ -696,6 +741,36 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);
}
+static inline void tk_update_ns_to_cyc(struct timekeeper *tks, struct timekeeper *tkc)
+{
+ struct tk_read_base *tkrs = &tks->tkr_mono;
+ struct tk_read_base *tkrc = &tkc->tkr_mono;
+ unsigned int shift;
+
+ if (!IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) ||
+ !(tkrs->clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT))
+ return;
+
+ if (tkrs->mult == tkrc->mult && tkrs->shift == tkrc->shift)
+ return;
+ /*
+ * The conversion math is simple:
+ *
+ * CS::MULT (1 << NS_TO_CYC_SHIFT)
+ * --------------- = ----------------------
+ * (1 << CS:SHIFT) NS_TO_CYC_MULT
+ *
+ * Ergo:
+ *
+ * NS_TO_CYC_MULT = (1 << (CS::SHIFT + NS_TO_CYC_SHIFT)) / CS::MULT
+ *
+ * NS_TO_CYC_SHIFT has been set up in tk_setup_internals()
+ */
+ shift = tkrs->shift + tks->cs_ns_to_cyc_shift;
+ tks->cs_ns_to_cyc_mult = (u32)div_u64(1ULL << shift, tkrs->mult);
+ tks->cs_ns_to_cyc_maxns = div_u64(tkrs->clock->mask, tks->cs_ns_to_cyc_mult);
+}
+
/*
* Restore the shadow timekeeper from the real timekeeper.
*/
@@ -730,6 +805,7 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act
tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
if (tk->id == TIMEKEEPER_CORE) {
+ tk_update_ns_to_cyc(tk, &tkd->timekeeper);
update_vsyscall(tk);
update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
@@ -784,6 +860,71 @@ static void timekeeping_forward_now(struct timekeeper *tk)
tk_update_coarse_nsecs(tk);
}
+/*
+ * ktime_expiry_to_cycles - Convert an expiry time to clocksource cycles
+ * @id: Clocksource ID which is required for validity
+ * @expires_ns: Absolute CLOCK_MONOTONIC expiry time (nsecs) to be converted
+ * @cycles: Pointer to storage for corresponding absolute cycles value
+ *
+ * Convert a CLOCK_MONOTONIC based absolute expiry time to a cycles value
+ * based on the correlated clocksource of the clockevent device by using
+ * the base nanoseconds and cycles values of the last timekeeper update and
+ * converting the delta between @expires_ns and base nanoseconds to cycles.
+ *
+ * This only works for clockevent devices which are using a less than or
+ * equal comparator against the clocksource.
+ *
+ * Utilizing this avoids two clocksource reads for such devices, the
+ * ktime_get() in clockevents_program_event() to calculate the delta expiry
+ * value and the readout in the device::set_next_event() callback to
+ * convert the delta back to an absolute comparator value.
+ *
+ * Returns: True if @id matches the current clocksource ID, false otherwise
+ */
+bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct tk_read_base *tkrm = &tk->tkr_mono;
+ ktime_t base_ns, delta_ns, max_ns;
+ u64 base_cycles, delta_cycles;
+ unsigned int seq;
+ u32 mult, shift;
+
+ /*
+ * Racy check to avoid the seqcount overhead when ID does not match. If
+ * the relevant clocksource is installed concurrently, then this will
+ * just delay the switch over to this mechanism until the next event is
+ * programmed. If the ID is not matching the clock events code will use
+ * the regular relative set_next_event() callback as before.
+ */
+ if (data_race(tk->cs_id) != id)
+ return false;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+
+ if (tk->cs_id != id)
+ return false;
+
+ base_cycles = tkrm->cycle_last;
+ base_ns = tkrm->base + (tkrm->xtime_nsec >> tkrm->shift);
+
+ mult = tk->cs_ns_to_cyc_mult;
+ shift = tk->cs_ns_to_cyc_shift;
+ max_ns = tk->cs_ns_to_cyc_maxns;
+
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ /* Prevent negative deltas and multiplication overflows */
+ delta_ns = min(expires_ns - base_ns, max_ns);
+ delta_ns = max(delta_ns, 0);
+
+ /* Convert to cycles */
+ delta_cycles = ((u64)delta_ns * mult) >> shift;
+ *cycles = base_cycles + delta_cycles;
+ return true;
+}
+
/**
* ktime_get_real_ts64 - Returns the time of day in a timespec64.
* @ts: pointer to the timespec to be set
@@ -848,7 +989,7 @@ u32 ktime_get_resolution_ns(void)
}
EXPORT_SYMBOL_GPL(ktime_get_resolution_ns);
-static ktime_t *offsets[TK_OFFS_MAX] = {
+static const ktime_t *const offsets[TK_OFFS_MAX] = {
[TK_OFFS_REAL] = &tk_core.timekeeper.offs_real,
[TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot,
[TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai,
@@ -857,8 +998,9 @@ static ktime_t *offsets[TK_OFFS_MAX] = {
ktime_t ktime_get_with_offset(enum tk_offsets offs)
{
struct timekeeper *tk = &tk_core.timekeeper;
+ const ktime_t *offset = offsets[offs];
unsigned int seq;
- ktime_t base, *offset = offsets[offs];
+ ktime_t base;
u64 nsecs;
WARN_ON(timekeeping_suspended);
@@ -878,8 +1020,9 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset);
ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
{
struct timekeeper *tk = &tk_core.timekeeper;
- ktime_t base, *offset = offsets[offs];
+ const ktime_t *offset = offsets[offs];
unsigned int seq;
+ ktime_t base;
u64 nsecs;
WARN_ON(timekeeping_suspended);
@@ -902,7 +1045,7 @@ EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
*/
ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
{
- ktime_t *offset = offsets[offs];
+ const ktime_t *offset = offsets[offs];
unsigned int seq;
ktime_t tconv;
@@ -1631,7 +1774,19 @@ int timekeeping_notify(struct clocksource *clock)
if (tk->tkr_mono.clock == clock)
return 0;
+
+	/* Disable inlined reads across the clocksource switch */
+ clocksource_disable_inline_read();
+
stop_machine(change_clocksource, clock, NULL);
+
+ /*
+ * If the clocksource has been selected and supports inlined reads
+ * enable the branch.
+ */
+ if (tk->tkr_mono.clock == clock && clock->flags & CLOCK_SOURCE_CAN_INLINE_READ)
+ clocksource_enable_inline_read();
+
tick_clock_notify();
return tk->tkr_mono.clock == clock ? 0 : -1;
}
@@ -2653,7 +2808,8 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux
if (aux_clock) {
/* Auxiliary clocks are similar to TAI and do not have leap seconds */
- if (txc->status & (STA_INS | STA_DEL))
+ if (txc->modes & ADJ_STATUS &&
+ txc->status & (STA_INS | STA_DEL))
return -EINVAL;
/* No TAI offset setting */
@@ -2661,7 +2817,8 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux
return -EINVAL;
/* No PPS support either */
- if (txc->status & (STA_PPSFREQ | STA_PPSTIME))
+ if (txc->modes & ADJ_STATUS &&
+ txc->status & (STA_PPSFREQ | STA_PPSTIME))
return -EINVAL;
}
@@ -2832,7 +2989,7 @@ static void tk_aux_update_clocksource(void)
continue;
timekeeping_forward_now(tks);
- tk_setup_internals(tks, tk_core.timekeeper.tkr_mono.clock);
+ tk_setup_internals(tks, tk_core.timekeeper.tkr_raw.clock);
timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
}
}
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 543beba096c7..198d0608db74 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -9,6 +9,8 @@ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq,
ktime_t *offs_boot,
ktime_t *offs_tai);
+bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles);
+
extern int timekeeping_valid_for_hres(void);
extern u64 timekeeping_max_deferment(void);
extern void timekeeping_warp_clock(void);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 7e1e3bde6b8b..04d928c21aba 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -2319,6 +2319,7 @@ u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle)
*/
void timer_clear_idle(void)
{
+ int this_cpu = smp_processor_id();
/*
* We do this unlocked. The worst outcome is a remote pinned timer
* enqueue sending a pointless IPI, but taking the lock would just
@@ -2327,9 +2328,9 @@ void timer_clear_idle(void)
* path. Required for BASE_LOCAL only.
*/
__this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false);
- if (tick_nohz_full_cpu(smp_processor_id()))
+ if (tick_nohz_full_cpu(this_cpu))
__this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false);
- trace_timer_base_idle(false, smp_processor_id());
+ trace_timer_base_idle(false, this_cpu);
/* Activate without holding the timer_base->lock */
tmigr_cpu_activate();
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 488e47e96e93..427d7ddea3af 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -47,7 +47,7 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
int idx, u64 now)
{
SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, ACCESS_PRIVATE(timer, function));
- SEQ_printf(m, ", S:%02x", timer->state);
+ SEQ_printf(m, ", S:%02x", timer->is_queued);
SEQ_printf(m, "\n");
SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
(unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)),
@@ -56,13 +56,11 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
(long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now));
}
-static void
-print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
- u64 now)
+static void print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
{
+ struct timerqueue_linked_node *curr;
struct hrtimer *timer, tmp;
unsigned long next = 0, i;
- struct timerqueue_node *curr;
unsigned long flags;
next_one:
@@ -72,13 +70,13 @@ next_one:
raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
- curr = timerqueue_getnext(&base->active);
+ curr = timerqueue_linked_first(&base->active);
/*
* Crude but we have to do this O(N*N) thing, because
* we have to unlock the base when printing:
*/
while (curr && i < next) {
- curr = timerqueue_iterate_next(curr);
+ curr = timerqueue_linked_next(curr);
i++;
}
@@ -103,8 +101,8 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution);
#ifdef CONFIG_HIGH_RES_TIMERS
- SEQ_printf(m, " .offset: %Lu nsecs\n",
- (unsigned long long) ktime_to_ns(base->offset));
+ SEQ_printf(m, " .offset: %Ld nsecs\n",
+ (long long) base->offset);
#endif
SEQ_printf(m, "active timers:\n");
print_active_timers(m, base, now + ktime_to_ns(base->offset));
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index c1ed0d5e8de6..155eeaea4113 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -1559,8 +1559,6 @@ int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask)
cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL;
int cpu;
- lockdep_assert_cpus_held();
-
if (!works)
return -ENOMEM;
if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
@@ -1570,6 +1568,7 @@ int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask)
* First set previously isolated CPUs as available (unisolate).
* This cpumask contains only CPUs that switched to available now.
*/
+ guard(cpus_read_lock)();
cpumask_andnot(cpumask, cpu_online_mask, exclude_cpumask);
cpumask_andnot(cpumask, cpumask, tmigr_available_cpumask);
@@ -1626,7 +1625,6 @@ static int __init tmigr_init_isolation(void)
cpumask_andnot(cpumask, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
/* Protect against RCU torture hotplug testing */
- guard(cpus_read_lock)();
return tmigr_isolated_exclude_cpumask(cpumask);
}
late_initcall(tmigr_init_isolation);
diff --git a/kernel/torture.c b/kernel/torture.c
index ec3370986976..62c1ac777694 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -93,7 +93,7 @@ int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, const enum hrtimer_mode
{
ktime_t hto = baset_ns;
- if (trsp)
+ if (trsp && fuzzt_ns)
hto += torture_random(trsp) % fuzzt_ns;
set_current_state(TASK_IDLE);
return schedule_hrtimeout(&hto, mode);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 49de13cae428..e130da35808f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1281,4 +1281,18 @@ config HIST_TRIGGERS_DEBUG
source "kernel/trace/rv/Kconfig"
+config TRACE_REMOTE
+ bool
+
+config SIMPLE_RING_BUFFER
+ bool
+
+config TRACE_REMOTE_TEST
+ tristate "Test module for remote tracing"
+ select TRACE_REMOTE
+ select SIMPLE_RING_BUFFER
+ help
+ This trace remote includes a ring-buffer writer implementation using
+	  "simple_ring_buffer". This is solely intended for testing.
+
endif # FTRACE
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 04096c21d06b..4d4229e5eec4 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_TRACING) += trace_seq.o
obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_TRACING) += trace_pid.o
+obj-$(CONFIG_TRACER_SNAPSHOT) += trace_snapshot.o
obj-$(CONFIG_TRACING) += pid_list.o
obj-$(CONFIG_TRACING_MAP) += tracing_map.o
obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o
@@ -128,4 +129,63 @@ obj-$(CONFIG_FPROBE_EVENTS) += trace_fprobe.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
obj-$(CONFIG_RV) += rv/
+obj-$(CONFIG_TRACE_REMOTE) += trace_remote.o
+obj-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o
+obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o
+
+#
+# simple_ring_buffer is used by the pKVM hypervisor which does not have access
+# to all kernel symbols. Fail the build if forbidden symbols are found.
+#
+# undefsyms_base generates a set of compiler and tooling-generated symbols that can
+# safely be ignored for simple_ring_buffer.
+#
+filechk_undefsyms_base = \
+ echo '$(pound)include <linux/atomic.h>'; \
+ echo '$(pound)include <linux/string.h>'; \
+ echo '$(pound)include <asm/page.h>'; \
+ echo 'static char page[PAGE_SIZE] __aligned(PAGE_SIZE);'; \
+ echo 'void undefsyms_base(void *p, int n);'; \
+ echo 'void undefsyms_base(void *p, int n) {'; \
+ echo ' char buffer[256] = { 0 };'; \
+ echo ' u32 u = 0;'; \
+ echo ' memset((char * volatile)page, 8, PAGE_SIZE);'; \
+ echo ' memset((char * volatile)buffer, 8, sizeof(buffer));'; \
+ echo ' memcpy((void * volatile)p, buffer, sizeof(buffer));'; \
+ echo ' cmpxchg((u32 * volatile)&u, 0, 8);'; \
+ echo ' WARN_ON(n == 0xdeadbeef);'; \
+ echo '}'
+
+$(obj)/undefsyms_base.c: FORCE
+ $(call filechk,undefsyms_base)
+
+clean-files += undefsyms_base.c
+
+$(obj)/undefsyms_base.o: $(obj)/undefsyms_base.c
+
+targets += undefsyms_base.o
+
+# Ensure KASAN is enabled to avoid logic that may disable FORTIFY_SOURCE when
+# KASAN is not enabled. undefsyms_base.o does not automatically get KASAN flags
+# because it is not linked into vmlinux.
+KASAN_SANITIZE_undefsyms_base.o := y
+
+UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __x86_indirect_thunk \
+ __msan simple_ring_buffer \
+ $(shell $(NM) -u $(obj)/undefsyms_base.o 2>/dev/null | awk '{print $$2}')
+
+quiet_cmd_check_undefined = NM $<
+ cmd_check_undefined = \
+ undefsyms=$$($(NM) -u $< | grep -v $(addprefix -e , $(UNDEFINED_ALLOWLIST)) || true); \
+ if [ -n "$$undefsyms" ]; then \
+ echo "Unexpected symbols in $<:" >&2; \
+ echo "$$undefsyms" >&2; \
+ false; \
+ fi
+
+$(obj)/%.o.checked: $(obj)/%.o $(obj)/undefsyms_base.o FORCE
+ $(call if_changed,check_undefined)
+
+always-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o.checked
+
libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 30259dcaa838..8cd2520b4c99 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -383,8 +383,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
cpu = raw_smp_processor_id();
if (blk_tracer) {
- tracing_record_cmdline(current);
-
buffer = blk_tr->array_buffer.buffer;
trace_ctx = tracing_gen_ctx_flags(0);
switch (bt->version) {
@@ -419,6 +417,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
if (!event)
return;
+ tracing_record_cmdline(current);
switch (bt->version) {
case 1:
record_blktrace_event(ring_buffer_event_data(event),
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 9bc0dfd235af..af7079aa0f36 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2454,8 +2454,10 @@ static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link *link,
struct seq_file *seq)
{
struct bpf_kprobe_multi_link *kmulti_link;
+ bool has_cookies;
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
+ has_cookies = !!kmulti_link->cookies;
seq_printf(seq,
"kprobe_cnt:\t%u\n"
@@ -2467,7 +2469,7 @@ static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link *link,
for (int i = 0; i < kmulti_link->cnt; i++) {
seq_printf(seq,
"%llu\t %pS\n",
- kmulti_link->cookies[i],
+ has_cookies ? kmulti_link->cookies[i] : 0,
(void *)kmulti_link->addrs[i]);
}
}
@@ -2750,6 +2752,10 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (!is_kprobe_multi(prog))
return -EINVAL;
+ /* kprobe_multi is not allowed to be sleepable. */
+ if (prog->sleepable)
+ return -EINVAL;
+
/* Writing to context is not allowed for kprobes. */
if (prog->aux->kprobe_write_ctx)
return -EINVAL;
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index dcadf1d23b8a..56d145017902 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -450,8 +450,6 @@ static int fprobe_fgraph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops
used += FPROBE_HEADER_SIZE_IN_LONG + size_words;
}
}
- if (used < reserved_words)
- memset(fgraph_data + used, 0, reserved_words - used);
/* If any exit_handler is set, data must be used. */
return used != 0;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 827fb9a0bf0d..b2611de3f594 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6404,6 +6404,7 @@ int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash)
new_filter_hash = old_filter_hash;
}
} else {
+ guard(mutex)(&ftrace_lock);
err = ftrace_update_ops(ops, new_filter_hash, EMPTY_HASH);
/*
* new_filter_hash is dup-ed, so we need to release it anyway,
@@ -6530,6 +6531,7 @@ int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash)
ops->func_hash->filter_hash = NULL;
}
} else {
+ guard(mutex)(&ftrace_lock);
err = ftrace_update_ops(ops, new_filter_hash, EMPTY_HASH);
/*
* new_filter_hash is dup-ed, so we need to release it anyway,
@@ -6604,9 +6606,9 @@ int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, b
if (!orig_hash)
goto unlock;
- /* Enable the tmp_ops to have the same functions as the direct ops */
+ /* Enable the tmp_ops to have the same functions as the hash object. */
ftrace_ops_init(&tmp_ops);
- tmp_ops.func_hash = ops->func_hash;
+ tmp_ops.func_hash->filter_hash = hash;
err = register_ftrace_function_nolock(&tmp_ops);
if (err)
@@ -6839,7 +6841,8 @@ bool ftrace_filter_param __initdata;
static int __init set_ftrace_notrace(char *str)
{
ftrace_filter_param = true;
- strscpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+ trace_append_boot_param(ftrace_notrace_buf, str, ',',
+ FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_notrace=", set_ftrace_notrace);
@@ -6847,7 +6850,8 @@ __setup("ftrace_notrace=", set_ftrace_notrace);
static int __init set_ftrace_filter(char *str)
{
ftrace_filter_param = true;
- strscpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+ trace_append_boot_param(ftrace_filter_buf, str, ',',
+ FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_filter=", set_ftrace_filter);
@@ -6859,14 +6863,16 @@ static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer);
static int __init set_graph_function(char *str)
{
- strscpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
+ trace_append_boot_param(ftrace_graph_buf, str, ',',
+ FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_graph_filter=", set_graph_function);
static int __init set_graph_notrace_function(char *str)
{
- strscpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE);
+ trace_append_boot_param(ftrace_graph_notrace_buf, str, ',',
+ FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_graph_notrace=", set_graph_notrace_function);
@@ -8611,6 +8617,7 @@ ftrace_pid_follow_sched_process_fork(void *data,
struct trace_pid_list *pid_list;
struct trace_array *tr = data;
+ guard(preempt)();
pid_list = rcu_dereference_sched(tr->function_pids);
trace_filter_add_remove_task(pid_list, self, task);
@@ -8624,6 +8631,7 @@ ftrace_pid_follow_sched_process_exit(void *data, struct task_struct *task)
struct trace_pid_list *pid_list;
struct trace_array *tr = data;
+ guard(preempt)();
pid_list = rcu_dereference_sched(tr->function_pids);
trace_filter_add_remove_task(pid_list, NULL, task);
@@ -9263,6 +9271,15 @@ static int kallsyms_callback(void *data, const char *name, unsigned long addr)
* @addrs array, which needs to be big enough to store at least @cnt
* addresses.
*
+ * For a single symbol (cnt == 1), uses kallsyms_lookup_name() which
+ * performs an O(log N) binary search via the sorted kallsyms index.
+ * This avoids the full O(N) linear scan over all kernel symbols that
+ * the multi-symbol path requires.
+ *
+ * For multiple symbols, uses a single-pass linear scan via
+ * kallsyms_on_each_symbol() with binary search into the sorted input
+ * array.
+ *
* Returns: 0 if all provided symbols are found, -ESRCH otherwise.
*/
int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs)
@@ -9270,6 +9287,19 @@ int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *a
struct kallsyms_data args;
int found_all;
+ /* Fast path: single symbol uses O(log N) binary search */
+ if (cnt == 1) {
+ addrs[0] = kallsyms_lookup_name(sorted_syms[0]);
+ if (addrs[0] && ftrace_location(addrs[0]))
+ return 0;
+ /*
+ * Binary lookup can fail for duplicate symbol names
+ * where the first match is not ftrace-instrumented.
+ * Retry with linear scan.
+ */
+ }
+
+ /* Batch path: single-pass O(N) linear scan */
memset(addrs, 0, sizeof(*addrs) * cnt);
args.addrs = addrs;
args.syms = sorted_syms;
diff --git a/kernel/trace/remote_test.c b/kernel/trace/remote_test.c
new file mode 100644
index 000000000000..6c1b7701ddae
--- /dev/null
+++ b/kernel/trace/remote_test.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 - Google LLC
+ * Author: Vincent Donnefort <vdonnefort@google.com>
+ */
+
+#include <linux/module.h>
+#include <linux/simple_ring_buffer.h>
+#include <linux/trace_remote.h>
+#include <linux/tracefs.h>
+#include <linux/types.h>
+
+#define REMOTE_EVENT_INCLUDE_FILE kernel/trace/remote_test_events.h
+#include <trace/define_remote_events.h>
+
+static DEFINE_PER_CPU(struct simple_rb_per_cpu *, simple_rbs);
+static struct trace_buffer_desc *remote_test_buffer_desc;
+
+/*
+ * The trace_remote lock already serializes accesses from the trace_remote_callbacks.
+ * However write_event can still race with load/unload.
+ */
+static DEFINE_MUTEX(simple_rbs_lock);
+
+static int remote_test_load_simple_rb(int cpu, struct ring_buffer_desc *rb_desc)
+{
+ struct simple_rb_per_cpu *cpu_buffer;
+ struct simple_buffer_page *bpages;
+ int ret = -ENOMEM;
+
+ cpu_buffer = kmalloc_obj(*cpu_buffer);
+ if (!cpu_buffer)
+ return ret;
+
+ bpages = kmalloc_objs(*bpages, rb_desc->nr_page_va);
+ if (!bpages)
+ goto err_free_cpu_buffer;
+
+ ret = simple_ring_buffer_init(cpu_buffer, bpages, rb_desc);
+ if (ret)
+ goto err_free_bpages;
+
+ scoped_guard(mutex, &simple_rbs_lock) {
+ WARN_ON(*per_cpu_ptr(&simple_rbs, cpu));
+ *per_cpu_ptr(&simple_rbs, cpu) = cpu_buffer;
+ }
+
+ return 0;
+
+err_free_bpages:
+ kfree(bpages);
+
+err_free_cpu_buffer:
+ kfree(cpu_buffer);
+
+ return ret;
+}
+
+static void remote_test_unload_simple_rb(int cpu)
+{
+ struct simple_rb_per_cpu *cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu);
+ struct simple_buffer_page *bpages;
+
+ if (!cpu_buffer)
+ return;
+
+ guard(mutex)(&simple_rbs_lock);
+
+ bpages = cpu_buffer->bpages;
+ simple_ring_buffer_unload(cpu_buffer);
+ kfree(bpages);
+ kfree(cpu_buffer);
+ *per_cpu_ptr(&simple_rbs, cpu) = NULL;
+}
+
+static struct trace_buffer_desc *remote_test_load(unsigned long size, void *unused)
+{
+ struct ring_buffer_desc *rb_desc;
+ struct trace_buffer_desc *desc;
+ size_t desc_size;
+ int cpu, ret;
+
+ if (WARN_ON(remote_test_buffer_desc))
+ return ERR_PTR(-EINVAL);
+
+ desc_size = trace_buffer_desc_size(size, num_possible_cpus());
+ if (desc_size == SIZE_MAX) {
+ ret = -E2BIG;
+ goto err;
+ }
+
+ desc = kmalloc(desc_size, GFP_KERNEL);
+ if (!desc) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = trace_remote_alloc_buffer(desc, desc_size, size, cpu_possible_mask);
+ if (ret)
+ goto err_free_desc;
+
+ for_each_ring_buffer_desc(rb_desc, cpu, desc) {
+ ret = remote_test_load_simple_rb(rb_desc->cpu, rb_desc);
+ if (ret)
+ goto err_unload;
+ }
+
+ remote_test_buffer_desc = desc;
+
+ return remote_test_buffer_desc;
+
+err_unload:
+ for_each_ring_buffer_desc(rb_desc, cpu, remote_test_buffer_desc)
+ remote_test_unload_simple_rb(rb_desc->cpu);
+ trace_remote_free_buffer(remote_test_buffer_desc);
+
+err_free_desc:
+ kfree(desc);
+
+err:
+ return ERR_PTR(ret);
+}
+
+static void remote_test_unload(struct trace_buffer_desc *desc, void *unused)
+{
+ struct ring_buffer_desc *rb_desc;
+ int cpu;
+
+ if (WARN_ON(desc != remote_test_buffer_desc))
+ return;
+
+ for_each_ring_buffer_desc(rb_desc, cpu, desc)
+ remote_test_unload_simple_rb(rb_desc->cpu);
+
+ remote_test_buffer_desc = NULL;
+ trace_remote_free_buffer(desc);
+ kfree(desc);
+}
+
+static int remote_test_enable_tracing(bool enable, void *unused)
+{
+ struct ring_buffer_desc *rb_desc;
+ int cpu;
+
+ if (!remote_test_buffer_desc)
+ return -ENODEV;
+
+ for_each_ring_buffer_desc(rb_desc, cpu, remote_test_buffer_desc)
+ WARN_ON(simple_ring_buffer_enable_tracing(*per_cpu_ptr(&simple_rbs, rb_desc->cpu),
+ enable));
+ return 0;
+}
+
+static int remote_test_swap_reader_page(unsigned int cpu, void *unused)
+{
+ struct simple_rb_per_cpu *cpu_buffer;
+
+ if (cpu >= NR_CPUS)
+ return -EINVAL;
+
+ cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu);
+ if (!cpu_buffer)
+ return -EINVAL;
+
+ return simple_ring_buffer_swap_reader_page(cpu_buffer);
+}
+
+static int remote_test_reset(unsigned int cpu, void *unused)
+{
+ struct simple_rb_per_cpu *cpu_buffer;
+
+ if (cpu >= NR_CPUS)
+ return -EINVAL;
+
+ cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu);
+ if (!cpu_buffer)
+ return -EINVAL;
+
+ return simple_ring_buffer_reset(cpu_buffer);
+}
+
+static int remote_test_enable_event(unsigned short id, bool enable, void *unused)
+{
+ if (id != REMOTE_TEST_EVENT_ID)
+ return -EINVAL;
+
+ /*
+ * Let's just use the struct remote_event enabled field that is turned on and off by
+ * trace_remote. This is a bit racy but good enough for a simple test module.
+ */
+ return 0;
+}
+
+static ssize_t
+write_event_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *pos)
+{
+ struct remote_event_format_selftest *evt_test;
+ struct simple_rb_per_cpu *cpu_buffer;
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ guard(mutex)(&simple_rbs_lock);
+
+ if (!remote_event_selftest.enabled)
+ return -ENODEV;
+
+ guard(preempt)();
+
+ cpu_buffer = *this_cpu_ptr(&simple_rbs);
+ if (!cpu_buffer)
+ return -ENODEV;
+
+ evt_test = simple_ring_buffer_reserve(cpu_buffer,
+ sizeof(struct remote_event_format_selftest),
+ trace_clock_global());
+ if (!evt_test)
+ return -ENODEV;
+
+ evt_test->hdr.id = REMOTE_TEST_EVENT_ID;
+ evt_test->id = val;
+
+ simple_ring_buffer_commit(cpu_buffer);
+
+ return cnt;
+}
+
+static const struct file_operations write_event_fops = {
+ .write = write_event_write,
+};
+
+static int remote_test_init_tracefs(struct dentry *d, void *unused)
+{
+ return tracefs_create_file("write_event", 0200, d, NULL, &write_event_fops) ?
+ 0 : -ENOMEM;
+}
+
+static struct trace_remote_callbacks trace_remote_callbacks = {
+ .init = remote_test_init_tracefs,
+ .load_trace_buffer = remote_test_load,
+ .unload_trace_buffer = remote_test_unload,
+ .enable_tracing = remote_test_enable_tracing,
+ .swap_reader_page = remote_test_swap_reader_page,
+ .reset = remote_test_reset,
+ .enable_event = remote_test_enable_event,
+};
+
+static int __init remote_test_init(void)
+{
+ return trace_remote_register("test", &trace_remote_callbacks, NULL,
+ &remote_event_selftest, 1);
+}
+
+module_init(remote_test_init);
+
+MODULE_DESCRIPTION("Test module for the trace remote interface");
+MODULE_AUTHOR("Vincent Donnefort");
+MODULE_LICENSE("GPL");
diff --git a/kernel/trace/remote_test_events.h b/kernel/trace/remote_test_events.h
new file mode 100644
index 000000000000..26b93b3406fc
--- /dev/null
+++ b/kernel/trace/remote_test_events.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define REMOTE_TEST_EVENT_ID 1
+
+REMOTE_EVENT(selftest, REMOTE_TEST_EVENT_ID,
+ RE_STRUCT(
+ re_field(u64, id)
+ ),
+ RE_PRINTK("id=%llu", __entry->id)
+);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f16f053ef77d..cef49f8871d2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,6 +4,7 @@
*
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*/
+#include <linux/ring_buffer_types.h>
#include <linux/sched/isolation.h>
#include <linux/trace_recursion.h>
#include <linux/trace_events.h>
@@ -157,23 +158,6 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
/* Used for individual buffers (after the counter) */
#define RB_BUFFER_OFF (1 << 20)
-#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
-
-#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
-#define RB_ALIGNMENT 4U
-#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
-#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
-
-#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
-# define RB_FORCE_8BYTE_ALIGNMENT 0
-# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
-#else
-# define RB_FORCE_8BYTE_ALIGNMENT 1
-# define RB_ARCH_ALIGNMENT 8U
-#endif
-
-#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT)
-
/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -316,10 +300,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
#define for_each_online_buffer_cpu(buffer, cpu) \
for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask)
-#define TS_SHIFT 27
-#define TS_MASK ((1ULL << TS_SHIFT) - 1)
-#define TS_DELTA_TEST (~TS_MASK)
-
static u64 rb_event_time_stamp(struct ring_buffer_event *event)
{
u64 ts;
@@ -338,12 +318,6 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event)
#define RB_MISSED_MASK (3 << 30)
-struct buffer_data_page {
- u64 time_stamp; /* page time stamp */
- local_t commit; /* write committed index */
- unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
-};
-
struct buffer_data_read_page {
unsigned order; /* order of the page */
struct buffer_data_page *data; /* actual data, stored in this page */
@@ -437,14 +411,6 @@ static struct buffer_data_page *alloc_cpu_data(int cpu, int order)
return dpage;
}
-/*
- * We need to fit the time_stamp delta into 27 bits.
- */
-static inline bool test_time_stamp(u64 delta)
-{
- return !!(delta & TS_DELTA_TEST);
-}
-
struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
@@ -555,10 +521,12 @@ struct ring_buffer_per_cpu {
unsigned int mapped;
unsigned int user_mapped; /* user space mapping */
struct mutex mapping_lock;
- unsigned long *subbuf_ids; /* ID to subbuf VA */
+ struct buffer_page **subbuf_ids; /* ID to subbuf VA */
struct trace_buffer_meta *meta_page;
struct ring_buffer_cpu_meta *ring_meta;
+ struct ring_buffer_remote *remote;
+
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
struct list_head new_pages; /* new pages to add */
@@ -581,6 +549,8 @@ struct trace_buffer {
struct ring_buffer_per_cpu **buffers;
+ struct ring_buffer_remote *remote;
+
struct hlist_node node;
u64 (*clock)(void);
@@ -627,16 +597,17 @@ int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq
(unsigned int)sizeof(field.commit),
(unsigned int)is_signed_type(long));
- trace_seq_printf(s, "\tfield: int overwrite;\t"
+ trace_seq_printf(s, "\tfield: char overwrite;\t"
"offset:%u;\tsize:%u;\tsigned:%u;\n",
(unsigned int)offsetof(typeof(field), commit),
1,
- (unsigned int)is_signed_type(long));
+ (unsigned int)is_signed_type(char));
trace_seq_printf(s, "\tfield: char data;\t"
"offset:%u;\tsize:%u;\tsigned:%u;\n",
(unsigned int)offsetof(typeof(field), data),
- (unsigned int)buffer->subbuf_size,
+ (unsigned int)(buffer ? buffer->subbuf_size :
+ PAGE_SIZE - BUF_PAGE_HDR_SIZE),
(unsigned int)is_signed_type(char));
return !trace_seq_has_overflowed(s);
@@ -2053,7 +2024,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
entries += ret;
entry_bytes += local_read(&head_page->page->commit);
- local_set(&cpu_buffer->head_page->entries, ret);
+ local_set(&head_page->entries, ret);
if (head_page == cpu_buffer->commit_page)
break;
@@ -2238,6 +2209,40 @@ static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer,
}
}
+static struct ring_buffer_desc *ring_buffer_desc(struct trace_buffer_desc *trace_desc, int cpu)
+{
+ struct ring_buffer_desc *desc, *end;
+ size_t len;
+ int i;
+
+ if (!trace_desc)
+ return NULL;
+
+ if (cpu >= trace_desc->nr_cpus)
+ return NULL;
+
+ end = (struct ring_buffer_desc *)((void *)trace_desc + trace_desc->struct_len);
+ desc = __first_ring_buffer_desc(trace_desc);
+ len = struct_size(desc, page_va, desc->nr_page_va);
+ desc = (struct ring_buffer_desc *)((void *)desc + (len * cpu));
+
+ if (desc < end && desc->cpu == cpu)
+ return desc;
+
+ /* Missing CPUs, need to linear search */
+ for_each_ring_buffer_desc(desc, i, trace_desc) {
+ if (desc->cpu == cpu)
+ return desc;
+ }
+
+ return NULL;
+}
+
+static void *ring_buffer_desc_page(struct ring_buffer_desc *desc, unsigned int page_id)
+{
+ return page_id >= desc->nr_page_va ? NULL : (void *)desc->page_va[page_id];
+}
+
static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
long nr_pages, struct list_head *pages)
{
@@ -2245,6 +2250,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_cpu_meta *meta = NULL;
struct buffer_page *bpage, *tmp;
bool user_thread = current->mm != NULL;
+ struct ring_buffer_desc *desc = NULL;
long i;
/*
@@ -2273,6 +2279,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
if (buffer->range_addr_start)
meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu);
+ if (buffer->remote) {
+ desc = ring_buffer_desc(buffer->remote->desc, cpu_buffer->cpu);
+ if (!desc || WARN_ON(desc->nr_page_va != (nr_pages + 1)))
+ return -EINVAL;
+ }
+
for (i = 0; i < nr_pages; i++) {
bpage = alloc_cpu_page(cpu_buffer->cpu);
@@ -2297,6 +2309,16 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
rb_meta_buffer_update(cpu_buffer, bpage);
bpage->range = 1;
bpage->id = i + 1;
+ } else if (desc) {
+ void *p = ring_buffer_desc_page(desc, i + 1);
+
+ if (WARN_ON(!p))
+ goto free_pages;
+
+ bpage->page = p;
+ bpage->range = 1; /* bpage->page can't be freed */
+ bpage->id = i + 1;
+ cpu_buffer->subbuf_ids[i + 1] = bpage;
} else {
int order = cpu_buffer->buffer->subbuf_order;
bpage->page = alloc_cpu_data(cpu_buffer->cpu, order);
@@ -2394,6 +2416,30 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
if (cpu_buffer->ring_meta->head_buffer)
rb_meta_buffer_update(cpu_buffer, bpage);
bpage->range = 1;
+ } else if (buffer->remote) {
+ struct ring_buffer_desc *desc = ring_buffer_desc(buffer->remote->desc, cpu);
+
+ if (!desc)
+ goto fail_free_reader;
+
+ cpu_buffer->remote = buffer->remote;
+ cpu_buffer->meta_page = (struct trace_buffer_meta *)(void *)desc->meta_va;
+ cpu_buffer->nr_pages = nr_pages;
+ cpu_buffer->subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1,
+ sizeof(*cpu_buffer->subbuf_ids), GFP_KERNEL);
+ if (!cpu_buffer->subbuf_ids)
+ goto fail_free_reader;
+
+ /* Remote buffers are read-only and immutable */
+ atomic_inc(&cpu_buffer->record_disabled);
+ atomic_inc(&cpu_buffer->resize_disabled);
+
+ bpage->page = ring_buffer_desc_page(desc, cpu_buffer->meta_page->reader.id);
+ if (!bpage->page)
+ goto fail_free_reader;
+
+ bpage->range = 1;
+ cpu_buffer->subbuf_ids[0] = bpage;
} else {
int order = cpu_buffer->buffer->subbuf_order;
bpage->page = alloc_cpu_data(cpu, order);
@@ -2453,6 +2499,9 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
irq_work_sync(&cpu_buffer->irq_work.work);
+ if (cpu_buffer->remote)
+ kfree(cpu_buffer->subbuf_ids);
+
free_buffer_page(cpu_buffer->reader_page);
if (head) {
@@ -2475,7 +2524,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
int order, unsigned long start,
unsigned long end,
unsigned long scratch_size,
- struct lock_class_key *key)
+ struct lock_class_key *key,
+ struct ring_buffer_remote *remote)
{
struct trace_buffer *buffer __free(kfree) = NULL;
long nr_pages;
@@ -2515,6 +2565,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
if (!buffer->buffers)
goto fail_free_cpumask;
+ cpu = raw_smp_processor_id();
+
/* If start/end are specified, then that overrides size */
if (start && end) {
unsigned long buffers_start;
@@ -2570,6 +2622,15 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
buffer->range_addr_end = end;
rb_range_meta_init(buffer, nr_pages, scratch_size);
+ } else if (remote) {
+ struct ring_buffer_desc *desc = ring_buffer_desc(remote->desc, cpu);
+
+ buffer->remote = remote;
+ /* The writer is remote. This ring-buffer is read-only */
+ atomic_inc(&buffer->record_disabled);
+ nr_pages = desc->nr_page_va - 1;
+ if (nr_pages < 2)
+ goto fail_free_buffers;
} else {
/* need at least two pages */
@@ -2578,7 +2639,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
nr_pages = 2;
}
- cpu = raw_smp_processor_id();
cpumask_set_cpu(cpu, buffer->cpumask);
buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
if (!buffer->buffers[cpu])
@@ -2620,7 +2680,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
struct lock_class_key *key)
{
/* Default buffer page size - one system page */
- return alloc_buffer(size, flags, 0, 0, 0, 0, key);
+ return alloc_buffer(size, flags, 0, 0, 0, 0, key, NULL);
}
EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
@@ -2647,7 +2707,18 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flag
struct lock_class_key *key)
{
return alloc_buffer(size, flags, order, start, start + range_size,
- scratch_size, key);
+ scratch_size, key, NULL);
+}
+
+/**
+ * __ring_buffer_alloc_remote - allocate a new ring_buffer from a remote
+ * @remote: Contains a description of the ring-buffer pages and remote callbacks.
+ * @key: ring buffer reader_lock_key.
+ */
+struct trace_buffer *__ring_buffer_alloc_remote(struct ring_buffer_remote *remote,
+ struct lock_class_key *key)
+{
+ return alloc_buffer(0, 0, 0, 0, 0, 0, key, remote);
}
void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size)
@@ -4435,18 +4506,20 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta);
if (ret < 0) {
if (delta < ts) {
- buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
- cpu_buffer->cpu, ts, delta);
+ buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld clock:%pS\n",
+ cpu_buffer->cpu, ts, delta,
+ cpu_buffer->buffer->clock);
goto out;
}
}
if ((full && ts > info->ts) ||
(!full && ts + info->delta != info->ts)) {
- buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n",
+ buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\ntrace clock:%pS",
cpu_buffer->cpu,
ts + info->delta, info->ts, info->delta,
info->before, info->after,
- full ? " (full)" : "", show_interrupt_level());
+ full ? " (full)" : "", show_interrupt_level(),
+ cpu_buffer->buffer->clock);
}
out:
atomic_dec(this_cpu_ptr(&checking));
@@ -5274,10 +5347,61 @@ unsigned long ring_buffer_overruns(struct trace_buffer *buffer)
}
EXPORT_SYMBOL_GPL(ring_buffer_overruns);
+static bool rb_read_remote_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ local_set(&cpu_buffer->entries, READ_ONCE(cpu_buffer->meta_page->entries));
+ local_set(&cpu_buffer->overrun, READ_ONCE(cpu_buffer->meta_page->overrun));
+ local_set(&cpu_buffer->pages_touched, READ_ONCE(cpu_buffer->meta_page->pages_touched));
+ local_set(&cpu_buffer->pages_lost, READ_ONCE(cpu_buffer->meta_page->pages_lost));
+
+ return rb_num_of_entries(cpu_buffer);
+}
+
+static void rb_update_remote_head(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *next, *orig;
+ int retry = 3;
+
+ orig = next = cpu_buffer->head_page;
+ rb_inc_page(&next);
+
+ /* Run after the writer */
+ while (cpu_buffer->head_page->page->time_stamp > next->page->time_stamp) {
+ rb_inc_page(&next);
+
+ rb_list_head_clear(cpu_buffer->head_page->list.prev);
+ rb_inc_page(&cpu_buffer->head_page);
+ rb_set_list_to_head(cpu_buffer->head_page->list.prev);
+
+ if (cpu_buffer->head_page == orig) {
+ if (WARN_ON_ONCE(!(--retry)))
+ return;
+ }
+ }
+
+ orig = cpu_buffer->commit_page = cpu_buffer->head_page;
+ retry = 3;
+
+ while (cpu_buffer->commit_page->page->time_stamp < next->page->time_stamp) {
+ rb_inc_page(&next);
+ rb_inc_page(&cpu_buffer->commit_page);
+
+ if (cpu_buffer->commit_page == orig) {
+ if (WARN_ON_ONCE(!(--retry)))
+ return;
+ }
+ }
+}
+
static void rb_iter_reset(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+ if (cpu_buffer->remote) {
+ rb_read_remote_meta_page(cpu_buffer);
+ rb_update_remote_head(cpu_buffer);
+ }
+
/* Iterator usage is expected to have record disabled */
iter->head_page = cpu_buffer->reader_page;
iter->head = cpu_buffer->reader_page->read;
@@ -5428,7 +5552,65 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
}
static struct buffer_page *
-rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
+__rb_get_reader_page_from_remote(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *new_reader, *prev_reader, *prev_head, *new_head, *last;
+
+ if (!rb_read_remote_meta_page(cpu_buffer))
+ return NULL;
+
+ /* More to read on the reader page */
+ if (cpu_buffer->reader_page->read < rb_page_size(cpu_buffer->reader_page)) {
+ if (!cpu_buffer->reader_page->read)
+ cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
+ return cpu_buffer->reader_page;
+ }
+
+ prev_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id];
+
+ WARN_ON_ONCE(cpu_buffer->remote->swap_reader_page(cpu_buffer->cpu,
+ cpu_buffer->remote->priv));
+ /* nr_pages doesn't include the reader page */
+ if (WARN_ON_ONCE(cpu_buffer->meta_page->reader.id > cpu_buffer->nr_pages))
+ return NULL;
+
+ new_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id];
+
+ WARN_ON_ONCE(prev_reader == new_reader);
+
+ prev_head = new_reader; /* New reader was also the previous head */
+ new_head = prev_head;
+ rb_inc_page(&new_head);
+ last = prev_head;
+ rb_dec_page(&last);
+
+ /* Clear the old HEAD flag */
+ rb_list_head_clear(cpu_buffer->head_page->list.prev);
+
+ prev_reader->list.next = prev_head->list.next;
+ prev_reader->list.prev = prev_head->list.prev;
+
+ /* Swap prev_reader with new_reader */
+ last->list.next = &prev_reader->list;
+ new_head->list.prev = &prev_reader->list;
+
+ new_reader->list.prev = &new_reader->list;
+ new_reader->list.next = &new_head->list;
+
+ /* Reactivate the HEAD flag */
+ rb_set_list_to_head(&last->list);
+
+ cpu_buffer->head_page = new_head;
+ cpu_buffer->reader_page = new_reader;
+ cpu_buffer->pages = &new_head->list;
+ cpu_buffer->read_stamp = new_reader->page->time_stamp;
+ cpu_buffer->lost_events = cpu_buffer->meta_page->reader.lost_events;
+
+ return rb_page_size(cpu_buffer->reader_page) ? cpu_buffer->reader_page : NULL;
+}
+
+static struct buffer_page *
+__rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = NULL;
unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
@@ -5598,6 +5780,13 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
return reader;
}
+static struct buffer_page *
+rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_buffer->remote ? __rb_get_reader_page_from_remote(cpu_buffer) :
+ __rb_get_reader_page(cpu_buffer);
+}
+
static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
{
struct ring_buffer_event *event;
@@ -6154,6 +6343,8 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
meta->entries = local_read(&cpu_buffer->entries);
meta->overrun = local_read(&cpu_buffer->overrun);
meta->read = cpu_buffer->read;
+ meta->pages_lost = local_read(&cpu_buffer->pages_lost);
+ meta->pages_touched = local_read(&cpu_buffer->pages_touched);
/* Some archs do not have data cache coherency between kernel and user-space */
flush_kernel_vmap_range(cpu_buffer->meta_page, PAGE_SIZE);
@@ -6164,6 +6355,23 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *page;
+ if (cpu_buffer->remote) {
+ if (!cpu_buffer->remote->reset)
+ return;
+
+ cpu_buffer->remote->reset(cpu_buffer->cpu, cpu_buffer->remote->priv);
+ rb_read_remote_meta_page(cpu_buffer);
+
+ /* Read related values, not covered by the meta-page */
+ local_set(&cpu_buffer->pages_read, 0);
+ cpu_buffer->read = 0;
+ cpu_buffer->read_bytes = 0;
+ cpu_buffer->last_overrun = 0;
+ cpu_buffer->reader_page->read = 0;
+
+ return;
+ }
+
rb_head_page_deactivate(cpu_buffer);
cpu_buffer->head_page
@@ -6394,6 +6602,46 @@ bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu)
}
EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
+int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (cpu != RING_BUFFER_ALL_CPUS) {
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -EINVAL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ guard(raw_spinlock)(&cpu_buffer->reader_lock);
+ if (rb_read_remote_meta_page(cpu_buffer))
+ rb_wakeups(buffer, cpu_buffer);
+
+ return 0;
+ }
+
+ guard(cpus_read_lock)();
+
+ /*
+ * Make sure all the ring buffers are up to date before we start reading
+ * them.
+ */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ guard(raw_spinlock)(&cpu_buffer->reader_lock);
+ rb_read_remote_meta_page(cpu_buffer);
+ }
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ if (rb_num_of_entries(cpu_buffer))
+ rb_wakeups(buffer, cpu_buffer);
+ }
+
+ return 0;
+}
+
#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
/**
* ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
@@ -6632,6 +6880,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
unsigned int commit;
unsigned int read;
u64 save_timestamp;
+ bool force_memcpy;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -1;
@@ -6669,6 +6918,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
/* Check if any events were dropped */
missed_events = cpu_buffer->lost_events;
+ force_memcpy = cpu_buffer->mapped || cpu_buffer->remote;
+
/*
* If this page has been partially read or
* if len is not big enough to read the rest of the page or
@@ -6678,7 +6929,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
*/
if (read || (len < (commit - read)) ||
cpu_buffer->reader_page == cpu_buffer->commit_page ||
- cpu_buffer->mapped) {
+ force_memcpy) {
struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
unsigned int rpos = read;
unsigned int pos = 0;
@@ -7034,7 +7285,7 @@ static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
}
static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned long *subbuf_ids)
+ struct buffer_page **subbuf_ids)
{
struct trace_buffer_meta *meta = cpu_buffer->meta_page;
unsigned int nr_subbufs = cpu_buffer->nr_pages + 1;
@@ -7043,7 +7294,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
int id = 0;
id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, id);
- subbuf_ids[id++] = (unsigned long)cpu_buffer->reader_page->page;
+ subbuf_ids[id++] = cpu_buffer->reader_page;
cnt++;
first_subbuf = subbuf = rb_set_head_page(cpu_buffer);
@@ -7053,7 +7304,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
if (WARN_ON(id >= nr_subbufs))
break;
- subbuf_ids[id] = (unsigned long)subbuf->page;
+ subbuf_ids[id] = subbuf;
rb_inc_page(&subbuf);
id++;
@@ -7062,7 +7313,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
WARN_ON(cnt != nr_subbufs);
- /* install subbuf ID to kern VA translation */
+ /* install subbuf ID to bpage translation */
cpu_buffer->subbuf_ids = subbuf_ids;
meta->meta_struct_len = sizeof(*meta);
@@ -7218,13 +7469,15 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
}
while (p < nr_pages) {
+ struct buffer_page *subbuf;
struct page *page;
int off = 0;
if (WARN_ON_ONCE(s >= nr_subbufs))
return -EINVAL;
- page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]);
+ subbuf = cpu_buffer->subbuf_ids[s];
+ page = virt_to_page((void *)subbuf->page);
for (; off < (1 << (subbuf_order)); off++, page++) {
if (p >= nr_pages)
@@ -7251,10 +7504,11 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
struct vm_area_struct *vma)
{
struct ring_buffer_per_cpu *cpu_buffer;
- unsigned long flags, *subbuf_ids;
+ struct buffer_page **subbuf_ids;
+ unsigned long flags;
int err;
- if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ if (!cpumask_test_cpu(cpu, buffer->cpumask) || buffer->remote)
return -EINVAL;
cpu_buffer = buffer->buffers[cpu];
@@ -7275,7 +7529,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
if (err)
return err;
- /* subbuf_ids include the reader while nr_pages does not */
+ /* subbuf_ids includes the reader while nr_pages does not */
subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL);
if (!subbuf_ids) {
rb_free_meta_page(cpu_buffer);
@@ -7310,6 +7564,27 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
return err;
}
+/*
+ * This is called when a VMA is duplicated (e.g., on fork()) to increment
+ * the user_mapped counter without remapping pages.
+ */
+void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (WARN_ON(!cpumask_test_cpu(cpu, buffer->cpumask)))
+ return;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ guard(mutex)(&cpu_buffer->mapping_lock);
+
+ if (cpu_buffer->user_mapped)
+ __rb_inc_dec_mapped(cpu_buffer, true);
+ else
+ WARN(1, "Unexpected buffer stat, it should be mapped");
+}
+
int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
@@ -7447,6 +7722,12 @@ out:
return 0;
}
+static void rb_cpu_sync(void *data)
+{
+ /* Not really needed, but documents what is happening */
+ smp_rmb();
+}
+
/*
* We only allocate new buffers, never free them if the CPU goes down.
* If we were to free the buffer, then the user would lose any trace that was in
@@ -7485,7 +7766,18 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node)
cpu);
return -ENOMEM;
}
- smp_wmb();
+
+ /*
+ * Ensure trace_buffer readers observe the newly allocated
+ * ring_buffer_per_cpu before they check the cpumask. Instead of using a
+ * read barrier for all readers, send an IPI.
+ */
+ if (unlikely(system_state == SYSTEM_RUNNING)) {
+ on_each_cpu(rb_cpu_sync, NULL, 1);
+ /* Not really needed, but documents what is happening */
+ smp_wmb();
+ }
+
cpumask_set_cpu(cpu, buffer->cpumask);
return 0;
}
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index 5b4be87ba59d..3884b14df375 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -23,6 +23,19 @@ config LTL_MON_EVENTS_ID
config RV_LTL_MONITOR
bool
+config RV_HA_MONITOR
+ bool
+
+config HA_MON_EVENTS_IMPLICIT
+ select DA_MON_EVENTS_IMPLICIT
+ select RV_HA_MONITOR
+ bool
+
+config HA_MON_EVENTS_ID
+ select DA_MON_EVENTS_ID
+ select RV_HA_MONITOR
+ bool
+
menuconfig RV
bool "Runtime Verification"
select TRACING
@@ -65,6 +78,11 @@ source "kernel/trace/rv/monitors/pagefault/Kconfig"
source "kernel/trace/rv/monitors/sleep/Kconfig"
# Add new rtapp monitors here
+source "kernel/trace/rv/monitors/stall/Kconfig"
+source "kernel/trace/rv/monitors/deadline/Kconfig"
+source "kernel/trace/rv/monitors/nomiss/Kconfig"
+# Add new deadline monitors here
+
# Add new monitors here
config RV_REACTORS
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index 750e4ad6fa0f..94498da35b37 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -17,6 +17,9 @@ obj-$(CONFIG_RV_MON_STS) += monitors/sts/sts.o
obj-$(CONFIG_RV_MON_NRP) += monitors/nrp/nrp.o
obj-$(CONFIG_RV_MON_SSSW) += monitors/sssw/sssw.o
obj-$(CONFIG_RV_MON_OPID) += monitors/opid/opid.o
+obj-$(CONFIG_RV_MON_STALL) += monitors/stall/stall.o
+obj-$(CONFIG_RV_MON_DEADLINE) += monitors/deadline/deadline.o
+obj-$(CONFIG_RV_MON_NOMISS) += monitors/nomiss/nomiss.o
# Add new monitors here
obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o
diff --git a/kernel/trace/rv/monitors/deadline/Kconfig b/kernel/trace/rv/monitors/deadline/Kconfig
new file mode 100644
index 000000000000..38804a6ad91d
--- /dev/null
+++ b/kernel/trace/rv/monitors/deadline/Kconfig
@@ -0,0 +1,10 @@
+config RV_MON_DEADLINE
+ depends on RV
+ bool "deadline monitor"
+ help
+ Collection of monitors to check the deadline scheduler and server
+	  behave according to specifications. Enable this to enable all
+	  scheduler specifications supported by the current kernel.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_deadline.rst
diff --git a/kernel/trace/rv/monitors/deadline/deadline.c b/kernel/trace/rv/monitors/deadline/deadline.c
new file mode 100644
index 000000000000..d566d4542ebf
--- /dev/null
+++ b/kernel/trace/rv/monitors/deadline/deadline.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <linux/kallsyms.h>
+
+#define MODULE_NAME "deadline"
+
+#include "deadline.h"
+
+struct rv_monitor rv_deadline = {
+ .name = "deadline",
+ .description = "container for several deadline scheduler specifications.",
+ .enable = NULL,
+ .disable = NULL,
+ .reset = NULL,
+ .enabled = 0,
+};
+
+/* Used by other monitors */
+struct sched_class *rv_ext_sched_class;
+
+static int __init register_deadline(void)
+{
+ if (IS_ENABLED(CONFIG_SCHED_CLASS_EXT)) {
+ rv_ext_sched_class = (void *)kallsyms_lookup_name("ext_sched_class");
+ if (!rv_ext_sched_class)
+ pr_warn("rv: Missing ext_sched_class, monitors may not work.\n");
+ }
+ return rv_register_monitor(&rv_deadline, NULL);
+}
+
+static void __exit unregister_deadline(void)
+{
+ rv_unregister_monitor(&rv_deadline);
+}
+
+module_init(register_deadline);
+module_exit(unregister_deadline);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("deadline: container for several deadline scheduler specifications.");
diff --git a/kernel/trace/rv/monitors/deadline/deadline.h b/kernel/trace/rv/monitors/deadline/deadline.h
new file mode 100644
index 000000000000..0bbfd2543329
--- /dev/null
+++ b/kernel/trace/rv/monitors/deadline/deadline.h
@@ -0,0 +1,202 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/sched/deadline.h>
+#include <asm/syscall.h>
+#include <uapi/linux/sched/types.h>
+#include <trace/events/sched.h>
+
+/*
+ * Dummy values if not available
+ */
+#ifndef __NR_sched_setscheduler
+#define __NR_sched_setscheduler -__COUNTER__
+#endif
+#ifndef __NR_sched_setattr
+#define __NR_sched_setattr -__COUNTER__
+#endif
+
+extern struct rv_monitor rv_deadline;
+/* Initialised when registering the deadline container */
+extern struct sched_class *rv_ext_sched_class;
+
+/*
+ * If both have dummy values, the syscalls are not supported and we don't even
+ * need to register the handler.
+ */
+static inline bool should_skip_syscall_handle(void)
+{
+ return __NR_sched_setattr < 0 && __NR_sched_setscheduler < 0;
+}
+
+/*
+ * is_supported_type - return true if @type is supported by the deadline monitors
+ */
+static inline bool is_supported_type(u8 type)
+{
+ return type == DL_TASK || type == DL_SERVER_FAIR || type == DL_SERVER_EXT;
+}
+
+/*
+ * is_server_type - return true if @type is a supported server
+ */
+static inline bool is_server_type(u8 type)
+{
+ return is_supported_type(type) && type != DL_TASK;
+}
+
+/*
+ * Use negative numbers for the server.
+ * Currently only one fair server per CPU, may change in the future.
+ */
+#define fair_server_id(cpu) (-cpu)
+#define ext_server_id(cpu) (-cpu - num_possible_cpus())
+#define NO_SERVER_ID (-2 * num_possible_cpus())
+/*
+ * Get a unique id used for dl entities
+ *
+ * The cpu is not required for tasks as the pid is used there, if this function
+ * is called on a dl_se that for sure corresponds to a task, DL_TASK can be
+ * used in place of cpu.
+ * We need the cpu for servers as it is provided in the tracepoint and we
+ * cannot easily retrieve it from the dl_se (requires the struct rq definition).
+ */
+static inline int get_entity_id(struct sched_dl_entity *dl_se, int cpu, u8 type)
+{
+ if (dl_server(dl_se) && type != DL_TASK) {
+ if (type == DL_SERVER_FAIR)
+ return fair_server_id(cpu);
+ if (type == DL_SERVER_EXT)
+ return ext_server_id(cpu);
+ return NO_SERVER_ID;
+ }
+ return dl_task_of(dl_se)->pid;
+}
+
+static inline bool task_is_scx_enabled(struct task_struct *tsk)
+{
+ return IS_ENABLED(CONFIG_SCHED_CLASS_EXT) &&
+ tsk->sched_class == rv_ext_sched_class;
+}
+
+/* Expand id and target as arguments for da functions */
+#define EXPAND_ID(dl_se, cpu, type) get_entity_id(dl_se, cpu, type), dl_se
+#define EXPAND_ID_TASK(tsk) get_entity_id(&tsk->dl, task_cpu(tsk), DL_TASK), &tsk->dl
+
+static inline u8 get_server_type(struct task_struct *tsk)
+{
+ if (tsk->policy == SCHED_NORMAL || tsk->policy == SCHED_EXT ||
+ tsk->policy == SCHED_BATCH || tsk->policy == SCHED_IDLE)
+ return task_is_scx_enabled(tsk) ? DL_SERVER_EXT : DL_SERVER_FAIR;
+ return DL_OTHER;
+}
+
+static inline int extract_params(struct pt_regs *regs, long id, pid_t *pid_out)
+{
+ size_t size = offsetofend(struct sched_attr, sched_flags);
+ struct sched_attr __user *uattr, attr;
+ int new_policy = -1, ret;
+ unsigned long args[6];
+
+ switch (id) {
+ case __NR_sched_setscheduler:
+ syscall_get_arguments(current, regs, args);
+ *pid_out = args[0];
+ new_policy = args[1];
+ break;
+ case __NR_sched_setattr:
+ syscall_get_arguments(current, regs, args);
+ *pid_out = args[0];
+ uattr = (struct sched_attr __user *)args[1];
+ /*
+ * Just copy up to sched_flags, we are not interested after that
+ */
+ ret = copy_struct_from_user(&attr, size, uattr, size);
+ if (ret)
+ return ret;
+ if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
+ return -EINVAL;
+ new_policy = attr.sched_policy;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return new_policy & ~SCHED_RESET_ON_FORK;
+}
+
+/* Helper functions requiring DA/HA utilities */
+#ifdef RV_MON_TYPE
+
+/*
+ * get_server - get the server of the given type associated with a task
+ *
+ * If the task is a boosted task, the server is available in the task_struct,
+ * otherwise grab the dl entity saved for the CPU where the task is enqueued.
+ * This function assumes the task is enqueued somewhere.
+ */
+static inline struct sched_dl_entity *get_server(struct task_struct *tsk, u8 type)
+{
+ if (tsk->dl_server && get_server_type(tsk) == type)
+ return tsk->dl_server;
+ if (type == DL_SERVER_FAIR)
+ return da_get_target_by_id(fair_server_id(task_cpu(tsk)));
+ if (type == DL_SERVER_EXT)
+ return da_get_target_by_id(ext_server_id(task_cpu(tsk)));
+ return NULL;
+}
+
+/*
+ * Initialise monitors for all tasks and pre-allocate the storage for servers.
+ * This is necessary since we don't have access to the servers here and
+ * allocation can cause deadlocks from their tracepoints. We can only fill
+ * pre-initialised storage from there.
+ */
+static inline int init_storage(bool skip_tasks)
+{
+ struct task_struct *g, *p;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (!da_create_empty_storage(fair_server_id(cpu)))
+ goto fail;
+ if (IS_ENABLED(CONFIG_SCHED_CLASS_EXT) &&
+ !da_create_empty_storage(ext_server_id(cpu)))
+ goto fail;
+ }
+
+ if (skip_tasks)
+ return 0;
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, p) {
+ if (p->policy == SCHED_DEADLINE) {
+ if (!da_create_storage(EXPAND_ID_TASK(p), NULL)) {
+ read_unlock(&tasklist_lock);
+ goto fail;
+ }
+ }
+ }
+ read_unlock(&tasklist_lock);
+ return 0;
+
+fail:
+ da_monitor_destroy();
+ return -ENOMEM;
+}
+
+static void __maybe_unused handle_newtask(void *data, struct task_struct *task, u64 flags)
+{
+	/* Might be superfluous, as tasks are not started with this policy. */
+ if (task->policy == SCHED_DEADLINE)
+ da_create_storage(EXPAND_ID_TASK(task), NULL);
+}
+
+static void __maybe_unused handle_exit(void *data, struct task_struct *p, bool group_dead)
+{
+ if (p->policy == SCHED_DEADLINE)
+ da_destroy_storage(get_entity_id(&p->dl, DL_TASK, DL_TASK));
+}
+
+#endif
diff --git a/kernel/trace/rv/monitors/nomiss/Kconfig b/kernel/trace/rv/monitors/nomiss/Kconfig
new file mode 100644
index 000000000000..e1886c3a0dd9
--- /dev/null
+++ b/kernel/trace/rv/monitors/nomiss/Kconfig
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_NOMISS
+ depends on RV
+ depends on HAVE_SYSCALL_TRACEPOINTS
+ depends on RV_MON_DEADLINE
+ default y
+ select HA_MON_EVENTS_ID
+ bool "nomiss monitor"
+ help
+	  Monitor to ensure dl entities run to completion before their deadline.
+ This monitor is part of the deadline monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_deadline.rst
diff --git a/kernel/trace/rv/monitors/nomiss/nomiss.c b/kernel/trace/rv/monitors/nomiss/nomiss.c
new file mode 100644
index 000000000000..31f90f3638d8
--- /dev/null
+++ b/kernel/trace/rv/monitors/nomiss/nomiss.c
@@ -0,0 +1,293 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+
+#define MODULE_NAME "nomiss"
+
+#include <uapi/linux/sched/types.h>
+#include <trace/events/syscalls.h>
+#include <trace/events/sched.h>
+#include <trace/events/task.h>
+#include <rv_trace.h>
+
+#define RV_MON_TYPE RV_MON_PER_OBJ
+#define HA_TIMER_TYPE HA_TIMER_WHEEL
+/* The start condition is on sched_switch, it's dangerous to allocate there */
+#define DA_SKIP_AUTO_ALLOC
+typedef struct sched_dl_entity *monitor_target;
+#include "nomiss.h"
+#include <rv/ha_monitor.h>
+#include <monitors/deadline/deadline.h>
+
+/*
+ * User configurable deadline threshold. If the total utilisation of deadline
+ * tasks is larger than 1, they are only guaranteed bounded tardiness. See
+ * Documentation/scheduler/sched-deadline.rst for more details.
+ * The minimum tardiness without sched_feat(HRTICK_DL) is 1 tick to accommodate
+ * for throttle enforced on the next tick.
+ */
+static u64 deadline_thresh = TICK_NSEC;
+module_param(deadline_thresh, ullong, 0644);
+#define DEADLINE_NS(ha_mon) (ha_get_target(ha_mon)->dl_deadline + deadline_thresh)
+
+static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_nomiss env, u64 time_ns)
+{
+ if (env == clk_nomiss)
+ return ha_get_clk_ns(ha_mon, env, time_ns);
+ else if (env == is_constr_dl_nomiss)
+ return !dl_is_implicit(ha_get_target(ha_mon));
+ else if (env == is_defer_nomiss)
+ return ha_get_target(ha_mon)->dl_defer;
+ return ENV_INVALID_VALUE;
+}
+
+static void ha_reset_env(struct ha_monitor *ha_mon, enum envs_nomiss env, u64 time_ns)
+{
+ if (env == clk_nomiss)
+ ha_reset_clk_ns(ha_mon, env, time_ns);
+}
+
+static inline bool ha_verify_invariants(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (curr_state == ready_nomiss)
+ return ha_check_invariant_ns(ha_mon, clk_nomiss, time_ns);
+ else if (curr_state == running_nomiss)
+ return ha_check_invariant_ns(ha_mon, clk_nomiss, time_ns);
+ return true;
+}
+
+static inline void ha_convert_inv_guard(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (curr_state == next_state)
+ return;
+ if (curr_state == ready_nomiss)
+ ha_inv_to_guard(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns);
+ else if (curr_state == running_nomiss)
+ ha_inv_to_guard(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns);
+}
+
+static inline bool ha_verify_guards(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ bool res = true;
+
+ if (curr_state == ready_nomiss && event == dl_replenish_nomiss)
+ ha_reset_env(ha_mon, clk_nomiss, time_ns);
+ else if (curr_state == ready_nomiss && event == dl_throttle_nomiss)
+ res = ha_get_env(ha_mon, is_defer_nomiss, time_ns) == 1ull;
+ else if (curr_state == idle_nomiss && event == dl_replenish_nomiss)
+ ha_reset_env(ha_mon, clk_nomiss, time_ns);
+ else if (curr_state == running_nomiss && event == dl_replenish_nomiss)
+ ha_reset_env(ha_mon, clk_nomiss, time_ns);
+ else if (curr_state == sleeping_nomiss && event == dl_replenish_nomiss)
+ ha_reset_env(ha_mon, clk_nomiss, time_ns);
+ else if (curr_state == sleeping_nomiss && event == dl_throttle_nomiss)
+ res = ha_get_env(ha_mon, is_constr_dl_nomiss, time_ns) == 1ull ||
+ ha_get_env(ha_mon, is_defer_nomiss, time_ns) == 1ull;
+ else if (curr_state == throttled_nomiss && event == dl_replenish_nomiss)
+ ha_reset_env(ha_mon, clk_nomiss, time_ns);
+ return res;
+}
+
+static inline void ha_setup_invariants(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (next_state == curr_state && event != dl_replenish_nomiss)
+ return;
+ if (next_state == ready_nomiss)
+ ha_start_timer_ns(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns);
+ else if (next_state == running_nomiss)
+ ha_start_timer_ns(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns);
+ else if (curr_state == ready_nomiss)
+ ha_cancel_timer(ha_mon);
+ else if (curr_state == running_nomiss)
+ ha_cancel_timer(ha_mon);
+}
+
+static bool ha_verify_constraint(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (!ha_verify_invariants(ha_mon, curr_state, event, next_state, time_ns))
+ return false;
+
+ ha_convert_inv_guard(ha_mon, curr_state, event, next_state, time_ns);
+
+ if (!ha_verify_guards(ha_mon, curr_state, event, next_state, time_ns))
+ return false;
+
+ ha_setup_invariants(ha_mon, curr_state, event, next_state, time_ns);
+
+ return true;
+}
+
+static void handle_dl_replenish(void *data, struct sched_dl_entity *dl_se,
+ int cpu, u8 type)
+{
+ if (is_supported_type(type))
+ da_handle_event(EXPAND_ID(dl_se, cpu, type), dl_replenish_nomiss);
+}
+
+static void handle_dl_throttle(void *data, struct sched_dl_entity *dl_se,
+ int cpu, u8 type)
+{
+ if (is_supported_type(type))
+ da_handle_event(EXPAND_ID(dl_se, cpu, type), dl_throttle_nomiss);
+}
+
+static void handle_dl_server_stop(void *data, struct sched_dl_entity *dl_se,
+ int cpu, u8 type)
+{
+ /*
+	 * This isn't the standard use of da_handle_start_run_event, since this
+	 * event can occur from states other than the initial one.
+ * It is fine to use here because it always brings to a known state and
+ * the fact we "pretend" the transition starts from the initial state
+ * has no side effect.
+ */
+ if (is_supported_type(type))
+ da_handle_start_run_event(EXPAND_ID(dl_se, cpu, type), dl_server_stop_nomiss);
+}
+
+static inline void handle_server_switch(struct task_struct *next, int cpu, u8 type)
+{
+ struct sched_dl_entity *dl_se = get_server(next, type);
+
+ if (dl_se && is_idle_task(next))
+ da_handle_event(EXPAND_ID(dl_se, cpu, type), dl_server_idle_nomiss);
+}
+
+static void handle_sched_switch(void *data, bool preempt,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
+{
+ int cpu = task_cpu(next);
+
+ if (prev_state != TASK_RUNNING && !preempt && prev->policy == SCHED_DEADLINE)
+ da_handle_event(EXPAND_ID_TASK(prev), sched_switch_suspend_nomiss);
+ if (next->policy == SCHED_DEADLINE)
+ da_handle_start_run_event(EXPAND_ID_TASK(next), sched_switch_in_nomiss);
+
+ /*
+ * The server is available in next only if the next task is boosted,
+ * otherwise we need to retrieve it.
+ * Here the server continues in the state running/armed until actually
+ * stopped, this works since we continue expecting a throttle.
+ */
+ if (next->dl_server)
+ da_handle_start_event(EXPAND_ID(next->dl_server, cpu,
+ get_server_type(next)),
+ sched_switch_in_nomiss);
+ else {
+ handle_server_switch(next, cpu, DL_SERVER_FAIR);
+ if (IS_ENABLED(CONFIG_SCHED_CLASS_EXT))
+ handle_server_switch(next, cpu, DL_SERVER_EXT);
+ }
+}
+
+static void handle_sys_enter(void *data, struct pt_regs *regs, long id)
+{
+ struct task_struct *p;
+ int new_policy = -1;
+ pid_t pid = 0;
+
+ new_policy = extract_params(regs, id, &pid);
+ if (new_policy < 0)
+ return;
+ guard(rcu)();
+ p = pid ? find_task_by_vpid(pid) : current;
+ if (unlikely(!p) || new_policy == p->policy)
+ return;
+
+ if (p->policy == SCHED_DEADLINE)
+ da_reset(EXPAND_ID_TASK(p));
+ else if (new_policy == SCHED_DEADLINE)
+ da_create_or_get(EXPAND_ID_TASK(p));
+}
+
+static void handle_sched_wakeup(void *data, struct task_struct *tsk)
+{
+ if (tsk->policy == SCHED_DEADLINE)
+ da_handle_event(EXPAND_ID_TASK(tsk), sched_wakeup_nomiss);
+}
+
+static int enable_nomiss(void)
+{
+ int retval;
+
+ retval = da_monitor_init();
+ if (retval)
+ return retval;
+
+ retval = init_storage(false);
+ if (retval)
+ return retval;
+ rv_attach_trace_probe("nomiss", sched_dl_replenish_tp, handle_dl_replenish);
+ rv_attach_trace_probe("nomiss", sched_dl_throttle_tp, handle_dl_throttle);
+ rv_attach_trace_probe("nomiss", sched_dl_server_stop_tp, handle_dl_server_stop);
+ rv_attach_trace_probe("nomiss", sched_switch, handle_sched_switch);
+ rv_attach_trace_probe("nomiss", sched_wakeup, handle_sched_wakeup);
+ if (!should_skip_syscall_handle())
+ rv_attach_trace_probe("nomiss", sys_enter, handle_sys_enter);
+ rv_attach_trace_probe("nomiss", task_newtask, handle_newtask);
+ rv_attach_trace_probe("nomiss", sched_process_exit, handle_exit);
+
+ return 0;
+}
+
+static void disable_nomiss(void)
+{
+ rv_this.enabled = 0;
+
+ /* Those are RCU writers, detach earlier hoping to close a bit faster */
+ rv_detach_trace_probe("nomiss", task_newtask, handle_newtask);
+ rv_detach_trace_probe("nomiss", sched_process_exit, handle_exit);
+ if (!should_skip_syscall_handle())
+ rv_detach_trace_probe("nomiss", sys_enter, handle_sys_enter);
+
+ rv_detach_trace_probe("nomiss", sched_dl_replenish_tp, handle_dl_replenish);
+ rv_detach_trace_probe("nomiss", sched_dl_throttle_tp, handle_dl_throttle);
+ rv_detach_trace_probe("nomiss", sched_dl_server_stop_tp, handle_dl_server_stop);
+ rv_detach_trace_probe("nomiss", sched_switch, handle_sched_switch);
+ rv_detach_trace_probe("nomiss", sched_wakeup, handle_sched_wakeup);
+
+ da_monitor_destroy();
+}
+
+static struct rv_monitor rv_this = {
+ .name = "nomiss",
+ .description = "dl entities run to completion before their deadline.",
+ .enable = enable_nomiss,
+ .disable = disable_nomiss,
+ .reset = da_monitor_reset_all,
+ .enabled = 0,
+};
+
+static int __init register_nomiss(void)
+{
+ return rv_register_monitor(&rv_this, &rv_deadline);
+}
+
+static void __exit unregister_nomiss(void)
+{
+ rv_unregister_monitor(&rv_this);
+}
+
+module_init(register_nomiss);
+module_exit(unregister_nomiss);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("nomiss: dl entities run to completion before their deadline.");
diff --git a/kernel/trace/rv/monitors/nomiss/nomiss.h b/kernel/trace/rv/monitors/nomiss/nomiss.h
new file mode 100644
index 000000000000..3d1b436194d7
--- /dev/null
+++ b/kernel/trace/rv/monitors/nomiss/nomiss.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of nomiss automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+#define MONITOR_NAME nomiss
+
+enum states_nomiss {
+ ready_nomiss,
+ idle_nomiss,
+ running_nomiss,
+ sleeping_nomiss,
+ throttled_nomiss,
+ state_max_nomiss,
+};
+
+#define INVALID_STATE state_max_nomiss
+
+enum events_nomiss {
+ dl_replenish_nomiss,
+ dl_server_idle_nomiss,
+ dl_server_stop_nomiss,
+ dl_throttle_nomiss,
+ sched_switch_in_nomiss,
+ sched_switch_suspend_nomiss,
+ sched_wakeup_nomiss,
+ event_max_nomiss,
+};
+
+enum envs_nomiss {
+ clk_nomiss,
+ is_constr_dl_nomiss,
+ is_defer_nomiss,
+ env_max_nomiss,
+ env_max_stored_nomiss = is_constr_dl_nomiss,
+};
+
+_Static_assert(env_max_stored_nomiss <= MAX_HA_ENV_LEN, "Not enough slots");
+#define HA_CLK_NS
+
+struct automaton_nomiss {
+ char *state_names[state_max_nomiss];
+ char *event_names[event_max_nomiss];
+ char *env_names[env_max_nomiss];
+ unsigned char function[state_max_nomiss][event_max_nomiss];
+ unsigned char initial_state;
+ bool final_states[state_max_nomiss];
+};
+
+static const struct automaton_nomiss automaton_nomiss = {
+ .state_names = {
+ "ready",
+ "idle",
+ "running",
+ "sleeping",
+ "throttled",
+ },
+ .event_names = {
+ "dl_replenish",
+ "dl_server_idle",
+ "dl_server_stop",
+ "dl_throttle",
+ "sched_switch_in",
+ "sched_switch_suspend",
+ "sched_wakeup",
+ },
+ .env_names = {
+ "clk",
+ "is_constr_dl",
+ "is_defer",
+ },
+ .function = {
+ {
+ ready_nomiss,
+ idle_nomiss,
+ sleeping_nomiss,
+ throttled_nomiss,
+ running_nomiss,
+ INVALID_STATE,
+ ready_nomiss,
+ },
+ {
+ ready_nomiss,
+ idle_nomiss,
+ sleeping_nomiss,
+ throttled_nomiss,
+ running_nomiss,
+ INVALID_STATE,
+ INVALID_STATE,
+ },
+ {
+ running_nomiss,
+ idle_nomiss,
+ sleeping_nomiss,
+ throttled_nomiss,
+ running_nomiss,
+ sleeping_nomiss,
+ running_nomiss,
+ },
+ {
+ ready_nomiss,
+ sleeping_nomiss,
+ sleeping_nomiss,
+ throttled_nomiss,
+ running_nomiss,
+ INVALID_STATE,
+ ready_nomiss,
+ },
+ {
+ ready_nomiss,
+ throttled_nomiss,
+ INVALID_STATE,
+ throttled_nomiss,
+ INVALID_STATE,
+ throttled_nomiss,
+ throttled_nomiss,
+ },
+ },
+ .initial_state = ready_nomiss,
+ .final_states = { 1, 0, 0, 0, 0 },
+};
diff --git a/kernel/trace/rv/monitors/nomiss/nomiss_trace.h b/kernel/trace/rv/monitors/nomiss/nomiss_trace.h
new file mode 100644
index 000000000000..42e7efaca4e7
--- /dev/null
+++ b/kernel/trace/rv/monitors/nomiss/nomiss_trace.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_NOMISS
+DEFINE_EVENT(event_da_monitor_id, event_nomiss,
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_nomiss,
+ TP_PROTO(int id, char *state, char *event),
+ TP_ARGS(id, state, event));
+
+DEFINE_EVENT(error_env_da_monitor_id, error_env_nomiss,
+ TP_PROTO(int id, char *state, char *event, char *env),
+ TP_ARGS(id, state, event, env));
+#endif /* CONFIG_RV_MON_NOMISS */
diff --git a/kernel/trace/rv/monitors/opid/Kconfig b/kernel/trace/rv/monitors/opid/Kconfig
index 561d32da572b..6d02e239b684 100644
--- a/kernel/trace/rv/monitors/opid/Kconfig
+++ b/kernel/trace/rv/monitors/opid/Kconfig
@@ -2,18 +2,13 @@
#
config RV_MON_OPID
depends on RV
- depends on TRACE_IRQFLAGS
- depends on TRACE_PREEMPT_TOGGLE
depends on RV_MON_SCHED
- default y if PREEMPT_RT
- select DA_MON_EVENTS_IMPLICIT
+ default y
+ select HA_MON_EVENTS_IMPLICIT
bool "opid monitor"
help
Monitor to ensure operations like wakeup and need resched occur with
- interrupts and preemption disabled or during IRQs, where preemption
- may not be disabled explicitly.
-
- This monitor is unstable on !PREEMPT_RT, say N unless you are testing it.
+ interrupts and preemption disabled.
For further information, see:
Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/opid/opid.c b/kernel/trace/rv/monitors/opid/opid.c
index 25a40e90fa40..4594c7c46601 100644
--- a/kernel/trace/rv/monitors/opid/opid.c
+++ b/kernel/trace/rv/monitors/opid/opid.c
@@ -10,94 +10,63 @@
#define MODULE_NAME "opid"
#include <trace/events/sched.h>
-#include <trace/events/irq.h>
-#include <trace/events/preemptirq.h>
#include <rv_trace.h>
#include <monitors/sched/sched.h>
#define RV_MON_TYPE RV_MON_PER_CPU
#include "opid.h"
-#include <rv/da_monitor.h>
+#include <rv/ha_monitor.h>
-#ifdef CONFIG_X86_LOCAL_APIC
-#include <asm/trace/irq_vectors.h>
-
-static void handle_vector_irq_entry(void *data, int vector)
+static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_opid env, u64 time_ns)
{
- da_handle_event(irq_entry_opid);
-}
-
-static void attach_vector_irq(void)
-{
- rv_attach_trace_probe("opid", local_timer_entry, handle_vector_irq_entry);
- if (IS_ENABLED(CONFIG_IRQ_WORK))
- rv_attach_trace_probe("opid", irq_work_entry, handle_vector_irq_entry);
- if (IS_ENABLED(CONFIG_SMP)) {
- rv_attach_trace_probe("opid", reschedule_entry, handle_vector_irq_entry);
- rv_attach_trace_probe("opid", call_function_entry, handle_vector_irq_entry);
- rv_attach_trace_probe("opid", call_function_single_entry, handle_vector_irq_entry);
+ if (env == irq_off_opid)
+ return irqs_disabled();
+ else if (env == preempt_off_opid) {
+ /*
+ * If CONFIG_PREEMPTION is enabled, the tracepoint itself disables
+ * preemption (adding one to the preempt_count). Since we are
+ * interested in the preempt_count at the time the tracepoint was
+ * hit, a count of exactly 1 still means preemption was enabled.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPTION))
+ return (preempt_count() & PREEMPT_MASK) > 1;
+ return true;
}
+ return ENV_INVALID_VALUE;
}
-static void detach_vector_irq(void)
+static inline bool ha_verify_guards(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
{
- rv_detach_trace_probe("opid", local_timer_entry, handle_vector_irq_entry);
- if (IS_ENABLED(CONFIG_IRQ_WORK))
- rv_detach_trace_probe("opid", irq_work_entry, handle_vector_irq_entry);
- if (IS_ENABLED(CONFIG_SMP)) {
- rv_detach_trace_probe("opid", reschedule_entry, handle_vector_irq_entry);
- rv_detach_trace_probe("opid", call_function_entry, handle_vector_irq_entry);
- rv_detach_trace_probe("opid", call_function_single_entry, handle_vector_irq_entry);
- }
+ bool res = true;
+
+ if (curr_state == any_opid && event == sched_need_resched_opid)
+ res = ha_get_env(ha_mon, irq_off_opid, time_ns) == 1ull;
+ else if (curr_state == any_opid && event == sched_waking_opid)
+ res = ha_get_env(ha_mon, irq_off_opid, time_ns) == 1ull &&
+ ha_get_env(ha_mon, preempt_off_opid, time_ns) == 1ull;
+ return res;
}
-#else
-/* We assume irq_entry tracepoints are sufficient on other architectures */
-static void attach_vector_irq(void) { }
-static void detach_vector_irq(void) { }
-#endif
-
-static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip)
+static bool ha_verify_constraint(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
{
- da_handle_event(irq_disable_opid);
-}
+ if (!ha_verify_guards(ha_mon, curr_state, event, next_state, time_ns))
+ return false;
-static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip)
-{
- da_handle_event(irq_enable_opid);
-}
-
-static void handle_irq_entry(void *data, int irq, struct irqaction *action)
-{
- da_handle_event(irq_entry_opid);
-}
-
-static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
-{
- da_handle_event(preempt_disable_opid);
-}
-
-static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
-{
- da_handle_event(preempt_enable_opid);
+ return true;
}
static void handle_sched_need_resched(void *data, struct task_struct *tsk, int cpu, int tif)
{
- /* The monitor's intitial state is not in_irq */
- if (this_cpu_read(hardirq_context))
- da_handle_event(sched_need_resched_opid);
- else
- da_handle_start_event(sched_need_resched_opid);
+ da_handle_start_run_event(sched_need_resched_opid);
}
static void handle_sched_waking(void *data, struct task_struct *p)
{
- /* The monitor's intitial state is not in_irq */
- if (this_cpu_read(hardirq_context))
- da_handle_event(sched_waking_opid);
- else
- da_handle_start_event(sched_waking_opid);
+ da_handle_start_run_event(sched_waking_opid);
}
static int enable_opid(void)
@@ -108,14 +77,8 @@ static int enable_opid(void)
if (retval)
return retval;
- rv_attach_trace_probe("opid", irq_disable, handle_irq_disable);
- rv_attach_trace_probe("opid", irq_enable, handle_irq_enable);
- rv_attach_trace_probe("opid", irq_handler_entry, handle_irq_entry);
- rv_attach_trace_probe("opid", preempt_disable, handle_preempt_disable);
- rv_attach_trace_probe("opid", preempt_enable, handle_preempt_enable);
rv_attach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched);
rv_attach_trace_probe("opid", sched_waking, handle_sched_waking);
- attach_vector_irq();
return 0;
}
@@ -124,14 +87,8 @@ static void disable_opid(void)
{
rv_this.enabled = 0;
- rv_detach_trace_probe("opid", irq_disable, handle_irq_disable);
- rv_detach_trace_probe("opid", irq_enable, handle_irq_enable);
- rv_detach_trace_probe("opid", irq_handler_entry, handle_irq_entry);
- rv_detach_trace_probe("opid", preempt_disable, handle_preempt_disable);
- rv_detach_trace_probe("opid", preempt_enable, handle_preempt_enable);
rv_detach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched);
rv_detach_trace_probe("opid", sched_waking, handle_sched_waking);
- detach_vector_irq();
da_monitor_destroy();
}
diff --git a/kernel/trace/rv/monitors/opid/opid.h b/kernel/trace/rv/monitors/opid/opid.h
index 092992514970..fb0aa4c28aa6 100644
--- a/kernel/trace/rv/monitors/opid/opid.h
+++ b/kernel/trace/rv/monitors/opid/opid.h
@@ -8,30 +8,31 @@
#define MONITOR_NAME opid
enum states_opid {
- disabled_opid,
- enabled_opid,
- in_irq_opid,
- irq_disabled_opid,
- preempt_disabled_opid,
+ any_opid,
state_max_opid,
};
#define INVALID_STATE state_max_opid
enum events_opid {
- irq_disable_opid,
- irq_enable_opid,
- irq_entry_opid,
- preempt_disable_opid,
- preempt_enable_opid,
sched_need_resched_opid,
sched_waking_opid,
event_max_opid,
};
+enum envs_opid {
+ irq_off_opid,
+ preempt_off_opid,
+ env_max_opid,
+ env_max_stored_opid = irq_off_opid,
+};
+
+_Static_assert(env_max_stored_opid <= MAX_HA_ENV_LEN, "Not enough slots");
+
struct automaton_opid {
char *state_names[state_max_opid];
char *event_names[event_max_opid];
+ char *env_names[env_max_opid];
unsigned char function[state_max_opid][event_max_opid];
unsigned char initial_state;
bool final_states[state_max_opid];
@@ -39,68 +40,19 @@ struct automaton_opid {
static const struct automaton_opid automaton_opid = {
.state_names = {
- "disabled",
- "enabled",
- "in_irq",
- "irq_disabled",
- "preempt_disabled",
+ "any",
},
.event_names = {
- "irq_disable",
- "irq_enable",
- "irq_entry",
- "preempt_disable",
- "preempt_enable",
"sched_need_resched",
"sched_waking",
},
+ .env_names = {
+ "irq_off",
+ "preempt_off",
+ },
.function = {
- {
- INVALID_STATE,
- preempt_disabled_opid,
- disabled_opid,
- INVALID_STATE,
- irq_disabled_opid,
- disabled_opid,
- disabled_opid,
- },
- {
- irq_disabled_opid,
- INVALID_STATE,
- INVALID_STATE,
- preempt_disabled_opid,
- enabled_opid,
- INVALID_STATE,
- INVALID_STATE,
- },
- {
- INVALID_STATE,
- enabled_opid,
- in_irq_opid,
- INVALID_STATE,
- INVALID_STATE,
- in_irq_opid,
- in_irq_opid,
- },
- {
- INVALID_STATE,
- enabled_opid,
- in_irq_opid,
- disabled_opid,
- INVALID_STATE,
- irq_disabled_opid,
- INVALID_STATE,
- },
- {
- disabled_opid,
- INVALID_STATE,
- INVALID_STATE,
- INVALID_STATE,
- enabled_opid,
- INVALID_STATE,
- INVALID_STATE,
- },
+ { any_opid, any_opid },
},
- .initial_state = disabled_opid,
- .final_states = { 0, 1, 0, 0, 0 },
+ .initial_state = any_opid,
+ .final_states = { 1 },
};
diff --git a/kernel/trace/rv/monitors/opid/opid_trace.h b/kernel/trace/rv/monitors/opid/opid_trace.h
index 3df6ff955c30..b04005b64208 100644
--- a/kernel/trace/rv/monitors/opid/opid_trace.h
+++ b/kernel/trace/rv/monitors/opid/opid_trace.h
@@ -12,4 +12,8 @@ DEFINE_EVENT(event_da_monitor, event_opid,
DEFINE_EVENT(error_da_monitor, error_opid,
TP_PROTO(char *state, char *event),
TP_ARGS(state, event));
+
+DEFINE_EVENT(error_env_da_monitor, error_env_opid,
+ TP_PROTO(char *state, char *event, char *env),
+ TP_ARGS(state, event, env));
#endif /* CONFIG_RV_MON_OPID */
diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c
index c1347da69e9d..8dfe5ec13e19 100644
--- a/kernel/trace/rv/monitors/sleep/sleep.c
+++ b/kernel/trace/rv/monitors/sleep/sleep.c
@@ -49,6 +49,7 @@ static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bo
ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false);
ltl_atom_set(mon, LTL_FUTEX_WAIT, false);
+ ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false);
ltl_atom_set(mon, LTL_BLOCK_ON_RT_MUTEX, false);
}
@@ -63,6 +64,7 @@ static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bo
ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false);
+ ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
if (strstarts(task->comm, "migration/"))
ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, true);
@@ -162,6 +164,11 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id)
break;
}
break;
+#ifdef __NR_epoll_wait
+ case __NR_epoll_wait:
+ ltl_atom_update(current, LTL_EPOLL_WAIT, true);
+ break;
+#endif
}
}
@@ -174,6 +181,7 @@ static void handle_sys_exit(void *data, struct pt_regs *regs, long ret)
ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false);
ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
+ ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, false);
}
diff --git a/kernel/trace/rv/monitors/sleep/sleep.h b/kernel/trace/rv/monitors/sleep/sleep.h
index 2ab46fd218d2..95dc2727c059 100644
--- a/kernel/trace/rv/monitors/sleep/sleep.h
+++ b/kernel/trace/rv/monitors/sleep/sleep.h
@@ -15,6 +15,7 @@ enum ltl_atom {
LTL_ABORT_SLEEP,
LTL_BLOCK_ON_RT_MUTEX,
LTL_CLOCK_NANOSLEEP,
+ LTL_EPOLL_WAIT,
LTL_FUTEX_LOCK_PI,
LTL_FUTEX_WAIT,
LTL_KERNEL_THREAD,
@@ -40,6 +41,7 @@ static const char *ltl_atom_str(enum ltl_atom atom)
"ab_sl",
"bl_on_rt_mu",
"cl_na",
+ "ep_wa",
"fu_lo_pi",
"fu_wa",
"ker_th",
@@ -75,39 +77,41 @@ static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES);
static void ltl_start(struct task_struct *task, struct ltl_monitor *mon)
{
- bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
- bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
- bool val40 = task_is_rcu || task_is_migration;
- bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
- bool val41 = futex_lock_pi || val40;
- bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
- bool val5 = block_on_rt_mutex || val41;
- bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms);
- bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
- bool val32 = abort_sleep || kthread_should_stop;
bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms);
- bool val33 = woken_by_nmi || val32;
bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms);
- bool val34 = woken_by_hardirq || val33;
bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
mon->atoms);
- bool val14 = woken_by_equal_or_higher_prio || val34;
bool wake = test_bit(LTL_WAKE, mon->atoms);
- bool val13 = !wake;
- bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
+ bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
+ bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
+ bool sleep = test_bit(LTL_SLEEP, mon->atoms);
+ bool rt = test_bit(LTL_RT, mon->atoms);
+ bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms);
bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms);
bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
- bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai;
- bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms);
- bool val25 = nanosleep_timer_abstime && val24;
- bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
- bool val18 = clock_nanosleep && val25;
+ bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms);
+ bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
- bool val9 = futex_wait || val18;
+ bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
+ bool epoll_wait = test_bit(LTL_EPOLL_WAIT, mon->atoms);
+ bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
+ bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
+ bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
+ bool val42 = task_is_rcu || task_is_migration;
+ bool val43 = futex_lock_pi || val42;
+ bool val5 = block_on_rt_mutex || val43;
+ bool val34 = abort_sleep || kthread_should_stop;
+ bool val35 = woken_by_nmi || val34;
+ bool val36 = woken_by_hardirq || val35;
+ bool val14 = woken_by_equal_or_higher_prio || val36;
+ bool val13 = !wake;
+ bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai;
+ bool val27 = nanosleep_timer_abstime && val26;
+ bool val18 = clock_nanosleep && val27;
+ bool val20 = val18 || epoll_wait;
+ bool val9 = futex_wait || val20;
bool val11 = val9 || kernel_thread;
- bool sleep = test_bit(LTL_SLEEP, mon->atoms);
bool val2 = !sleep;
- bool rt = test_bit(LTL_RT, mon->atoms);
bool val1 = !rt;
bool val3 = val1 || val2;
@@ -124,39 +128,41 @@ static void ltl_start(struct task_struct *task, struct ltl_monitor *mon)
static void
ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next)
{
- bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
- bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
- bool val40 = task_is_rcu || task_is_migration;
- bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
- bool val41 = futex_lock_pi || val40;
- bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
- bool val5 = block_on_rt_mutex || val41;
- bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms);
- bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
- bool val32 = abort_sleep || kthread_should_stop;
bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms);
- bool val33 = woken_by_nmi || val32;
bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms);
- bool val34 = woken_by_hardirq || val33;
bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
mon->atoms);
- bool val14 = woken_by_equal_or_higher_prio || val34;
bool wake = test_bit(LTL_WAKE, mon->atoms);
- bool val13 = !wake;
- bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
+ bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
+ bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
+ bool sleep = test_bit(LTL_SLEEP, mon->atoms);
+ bool rt = test_bit(LTL_RT, mon->atoms);
+ bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms);
bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms);
bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
- bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai;
- bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms);
- bool val25 = nanosleep_timer_abstime && val24;
- bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
- bool val18 = clock_nanosleep && val25;
+ bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms);
+ bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
- bool val9 = futex_wait || val18;
+ bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
+ bool epoll_wait = test_bit(LTL_EPOLL_WAIT, mon->atoms);
+ bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
+ bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
+ bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
+ bool val42 = task_is_rcu || task_is_migration;
+ bool val43 = futex_lock_pi || val42;
+ bool val5 = block_on_rt_mutex || val43;
+ bool val34 = abort_sleep || kthread_should_stop;
+ bool val35 = woken_by_nmi || val34;
+ bool val36 = woken_by_hardirq || val35;
+ bool val14 = woken_by_equal_or_higher_prio || val36;
+ bool val13 = !wake;
+ bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai;
+ bool val27 = nanosleep_timer_abstime && val26;
+ bool val18 = clock_nanosleep && val27;
+ bool val20 = val18 || epoll_wait;
+ bool val9 = futex_wait || val20;
bool val11 = val9 || kernel_thread;
- bool sleep = test_bit(LTL_SLEEP, mon->atoms);
bool val2 = !sleep;
- bool rt = test_bit(LTL_RT, mon->atoms);
bool val1 = !rt;
bool val3 = val1 || val2;
diff --git a/kernel/trace/rv/monitors/stall/Kconfig b/kernel/trace/rv/monitors/stall/Kconfig
new file mode 100644
index 000000000000..6f846b642544
--- /dev/null
+++ b/kernel/trace/rv/monitors/stall/Kconfig
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_STALL
+ depends on RV
+ select HA_MON_EVENTS_ID
+ bool "stall monitor"
+ help
+ Enable the stall sample monitor that illustrates the usage of hybrid
+ automata monitors. It can be used to identify tasks stalled for
+ longer than a threshold.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_stall.rst
diff --git a/kernel/trace/rv/monitors/stall/stall.c b/kernel/trace/rv/monitors/stall/stall.c
new file mode 100644
index 000000000000..9ccfda6b0e73
--- /dev/null
+++ b/kernel/trace/rv/monitors/stall/stall.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+
+#define MODULE_NAME "stall"
+
+#include <trace/events/sched.h>
+#include <rv_trace.h>
+
+#define RV_MON_TYPE RV_MON_PER_TASK
+#define HA_TIMER_TYPE HA_TIMER_WHEEL
+#include "stall.h"
+#include <rv/ha_monitor.h>
+
+static u64 threshold_jiffies = 1000;
+module_param(threshold_jiffies, ullong, 0644);
+
+static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_stall env, u64 time_ns)
+{
+ if (env == clk_stall)
+ return ha_get_clk_jiffy(ha_mon, env);
+ return ENV_INVALID_VALUE;
+}
+
+static void ha_reset_env(struct ha_monitor *ha_mon, enum envs_stall env, u64 time_ns)
+{
+ if (env == clk_stall)
+ ha_reset_clk_jiffy(ha_mon, env);
+}
+
+static inline bool ha_verify_invariants(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (curr_state == enqueued_stall)
+ return ha_check_invariant_jiffy(ha_mon, clk_stall, time_ns);
+ return true;
+}
+
+static inline bool ha_verify_guards(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ bool res = true;
+
+ if (curr_state == dequeued_stall && event == sched_wakeup_stall)
+ ha_reset_env(ha_mon, clk_stall, time_ns);
+ else if (curr_state == running_stall && event == sched_switch_preempt_stall)
+ ha_reset_env(ha_mon, clk_stall, time_ns);
+ return res;
+}
+
+static inline void ha_setup_invariants(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (next_state == curr_state)
+ return;
+ if (next_state == enqueued_stall)
+ ha_start_timer_jiffy(ha_mon, clk_stall, threshold_jiffies, time_ns);
+ else if (curr_state == enqueued_stall)
+ ha_cancel_timer(ha_mon);
+}
+
+static bool ha_verify_constraint(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (!ha_verify_invariants(ha_mon, curr_state, event, next_state, time_ns))
+ return false;
+
+ if (!ha_verify_guards(ha_mon, curr_state, event, next_state, time_ns))
+ return false;
+
+ ha_setup_invariants(ha_mon, curr_state, event, next_state, time_ns);
+
+ return true;
+}
+
+static void handle_sched_switch(void *data, bool preempt,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
+{
+ if (!preempt && prev_state != TASK_RUNNING)
+ da_handle_start_event(prev, sched_switch_wait_stall);
+ else
+ da_handle_event(prev, sched_switch_preempt_stall);
+ da_handle_event(next, sched_switch_in_stall);
+}
+
+static void handle_sched_wakeup(void *data, struct task_struct *p)
+{
+ da_handle_event(p, sched_wakeup_stall);
+}
+
+static int enable_stall(void)
+{
+ int retval;
+
+ retval = da_monitor_init();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("stall", sched_switch, handle_sched_switch);
+ rv_attach_trace_probe("stall", sched_wakeup, handle_sched_wakeup);
+
+ return 0;
+}
+
+static void disable_stall(void)
+{
+ rv_this.enabled = 0;
+
+ rv_detach_trace_probe("stall", sched_switch, handle_sched_switch);
+ rv_detach_trace_probe("stall", sched_wakeup, handle_sched_wakeup);
+
+ da_monitor_destroy();
+}
+
+static struct rv_monitor rv_this = {
+ .name = "stall",
+ .description = "identify tasks stalled for longer than a threshold.",
+ .enable = enable_stall,
+ .disable = disable_stall,
+ .reset = da_monitor_reset_all,
+ .enabled = 0,
+};
+
+static int __init register_stall(void)
+{
+ return rv_register_monitor(&rv_this, NULL);
+}
+
+static void __exit unregister_stall(void)
+{
+ rv_unregister_monitor(&rv_this);
+}
+
+module_init(register_stall);
+module_exit(unregister_stall);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("stall: identify tasks stalled for longer than a threshold.");
diff --git a/kernel/trace/rv/monitors/stall/stall.h b/kernel/trace/rv/monitors/stall/stall.h
new file mode 100644
index 000000000000..638520cb1082
--- /dev/null
+++ b/kernel/trace/rv/monitors/stall/stall.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of stall automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+#define MONITOR_NAME stall
+
+enum states_stall {
+ dequeued_stall,
+ enqueued_stall,
+ running_stall,
+ state_max_stall,
+};
+
+#define INVALID_STATE state_max_stall
+
+enum events_stall {
+ sched_switch_in_stall,
+ sched_switch_preempt_stall,
+ sched_switch_wait_stall,
+ sched_wakeup_stall,
+ event_max_stall,
+};
+
+enum envs_stall {
+ clk_stall,
+ env_max_stall,
+ env_max_stored_stall = env_max_stall,
+};
+
+_Static_assert(env_max_stored_stall <= MAX_HA_ENV_LEN, "Not enough slots");
+
+struct automaton_stall {
+ char *state_names[state_max_stall];
+ char *event_names[event_max_stall];
+ char *env_names[env_max_stall];
+ unsigned char function[state_max_stall][event_max_stall];
+ unsigned char initial_state;
+ bool final_states[state_max_stall];
+};
+
+static const struct automaton_stall automaton_stall = {
+ .state_names = {
+ "dequeued",
+ "enqueued",
+ "running",
+ },
+ .event_names = {
+ "sched_switch_in",
+ "sched_switch_preempt",
+ "sched_switch_wait",
+ "sched_wakeup",
+ },
+ .env_names = {
+ "clk",
+ },
+ .function = {
+ {
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ enqueued_stall,
+ },
+ {
+ running_stall,
+ INVALID_STATE,
+ INVALID_STATE,
+ enqueued_stall,
+ },
+ {
+ running_stall,
+ enqueued_stall,
+ dequeued_stall,
+ running_stall,
+ },
+ },
+ .initial_state = dequeued_stall,
+ .final_states = { 1, 0, 0 },
+};
diff --git a/kernel/trace/rv/monitors/stall/stall_trace.h b/kernel/trace/rv/monitors/stall/stall_trace.h
new file mode 100644
index 000000000000..6a7cc1b1d040
--- /dev/null
+++ b/kernel/trace/rv/monitors/stall/stall_trace.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_STALL
+DEFINE_EVENT(event_da_monitor_id, event_stall,
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_stall,
+ TP_PROTO(int id, char *state, char *event),
+ TP_ARGS(id, state, event));
+
+DEFINE_EVENT(error_env_da_monitor_id, error_env_stall,
+ TP_PROTO(int id, char *state, char *event, char *env),
+ TP_ARGS(id, state, event, env));
+#endif /* CONFIG_RV_MON_STALL */
diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h
index 4a6faddac614..9622c269789c 100644
--- a/kernel/trace/rv/rv_trace.h
+++ b/kernel/trace/rv/rv_trace.h
@@ -62,9 +62,39 @@ DECLARE_EVENT_CLASS(error_da_monitor,
#include <monitors/scpd/scpd_trace.h>
#include <monitors/snep/snep_trace.h>
#include <monitors/sts/sts_trace.h>
-#include <monitors/opid/opid_trace.h>
// Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here
+#ifdef CONFIG_HA_MON_EVENTS_IMPLICIT
+/* For simplicity this class keeps the DA naming, although it is used only by HA monitors */
+DECLARE_EVENT_CLASS(error_env_da_monitor,
+
+ TP_PROTO(char *state, char *event, char *env),
+
+ TP_ARGS(state, event, env),
+
+ TP_STRUCT__entry(
+ __string( state, state )
+ __string( event, event )
+ __string( env, env )
+ ),
+
+ TP_fast_assign(
+ __assign_str(state);
+ __assign_str(event);
+ __assign_str(env);
+ ),
+
+ TP_printk("event %s not expected in the state %s with env %s",
+ __get_str(event),
+ __get_str(state),
+ __get_str(env))
+);
+
+#include <monitors/opid/opid_trace.h>
+// Add new monitors based on CONFIG_HA_MON_EVENTS_IMPLICIT here
+
+#endif
+
#endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */
#ifdef CONFIG_DA_MON_EVENTS_ID
@@ -128,6 +158,41 @@ DECLARE_EVENT_CLASS(error_da_monitor_id,
#include <monitors/sssw/sssw_trace.h>
// Add new monitors based on CONFIG_DA_MON_EVENTS_ID here
+#ifdef CONFIG_HA_MON_EVENTS_ID
+/* For simplicity this class keeps the DA naming, although it is used only by HA monitors */
+DECLARE_EVENT_CLASS(error_env_da_monitor_id,
+
+ TP_PROTO(int id, char *state, char *event, char *env),
+
+ TP_ARGS(id, state, event, env),
+
+ TP_STRUCT__entry(
+ __field( int, id )
+ __string( state, state )
+ __string( event, event )
+ __string( env, env )
+ ),
+
+ TP_fast_assign(
+ __assign_str(state);
+ __assign_str(event);
+ __assign_str(env);
+ __entry->id = id;
+ ),
+
+ TP_printk("%d: event %s not expected in the state %s with env %s",
+ __entry->id,
+ __get_str(event),
+ __get_str(state),
+ __get_str(env))
+);
+
+#include <monitors/stall/stall_trace.h>
+#include <monitors/nomiss/nomiss_trace.h>
+// Add new monitors based on CONFIG_HA_MON_EVENTS_ID here
+
+#endif
+
#endif /* CONFIG_DA_MON_EVENTS_ID */
#ifdef CONFIG_LTL_MON_EVENTS_ID
DECLARE_EVENT_CLASS(event_ltl_monitor_id,
diff --git a/kernel/trace/simple_ring_buffer.c b/kernel/trace/simple_ring_buffer.c
new file mode 100644
index 000000000000..02af2297ae5a
--- /dev/null
+++ b/kernel/trace/simple_ring_buffer.c
@@ -0,0 +1,517 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 - Google LLC
+ * Author: Vincent Donnefort <vdonnefort@google.com>
+ */
+
+#include <linux/atomic.h>
+#include <linux/simple_ring_buffer.h>
+
+#include <asm/barrier.h>
+#include <asm/local.h>
+
+/*
+ * State encoded in the low bits of a page's ->link.next pointer:
+ * a link carrying HEAD designates the head page, and HEAD_MOVING is
+ * the transient state used while the writer moves the head.
+ */
+enum simple_rb_link_type {
+ SIMPLE_RB_LINK_NORMAL = 0,
+ SIMPLE_RB_LINK_HEAD = 1,
+ SIMPLE_RB_LINK_HEAD_MOVING
+};
+
+/* Parenthesize the whole expansion so it is safe in any expression context */
+#define SIMPLE_RB_LINK_MASK (~(SIMPLE_RB_LINK_HEAD | SIMPLE_RB_LINK_HEAD_MOVING))
+
+/*
+ * Tag @bpage's ->next link with the HEAD flag, making the page it points
+ * at the head page. Preserves the pointer bits, replaces the flag bits.
+ */
+static void simple_bpage_set_head_link(struct simple_buffer_page *bpage)
+{
+ unsigned long link = (unsigned long)bpage->link.next;
+
+ link &= SIMPLE_RB_LINK_MASK;
+ link |= SIMPLE_RB_LINK_HEAD;
+
+ /*
+ * Paired with simple_rb_find_head() to order access between the head
+ * link and overrun. It ensures we always report an up-to-date value
+ * after swapping the reader page.
+ */
+ smp_store_release(&bpage->link.next, (struct list_head *)link);
+}
+
+/*
+ * Atomically retarget @bpage's ->next to @dst with flag bits @new_type,
+ * but only if the link still carries the HEAD flag. Returns true when the
+ * exchange succeeded, false if another party changed the link first.
+ */
+static bool simple_bpage_unset_head_link(struct simple_buffer_page *bpage,
+ struct simple_buffer_page *dst,
+ enum simple_rb_link_type new_type)
+{
+ unsigned long *link = (unsigned long *)(&bpage->link.next);
+ unsigned long old = (*link & SIMPLE_RB_LINK_MASK) | SIMPLE_RB_LINK_HEAD;
+ unsigned long new = (unsigned long)(&dst->link) | new_type;
+
+ return try_cmpxchg(link, &old, new);
+}
+
+/* Clear the flag bits of @bpage's ->next link, leaving a plain pointer. */
+static void simple_bpage_set_normal_link(struct simple_buffer_page *bpage)
+{
+ unsigned long link = (unsigned long)bpage->link.next;
+
+ WRITE_ONCE(bpage->link.next, (struct list_head *)(link & SIMPLE_RB_LINK_MASK));
+}
+
+/* Strip the flag bits from @link before converting it back to its page. */
+static struct simple_buffer_page *simple_bpage_from_link(struct list_head *link)
+{
+ unsigned long ptr = (unsigned long)link & SIMPLE_RB_LINK_MASK;
+
+ return container_of((struct list_head *)ptr, struct simple_buffer_page, link);
+}
+
+/* Follow @bpage's ->next link (ignoring flag bits) to the next ring page. */
+static struct simple_buffer_page *simple_bpage_next_page(struct simple_buffer_page *bpage)
+{
+ return simple_bpage_from_link(bpage->link.next);
+}
+
+/*
+ * Reset @bpage's write position, entry count and committed length.
+ * The page payload itself is not cleared.
+ */
+static void simple_bpage_reset(struct simple_buffer_page *bpage)
+{
+ bpage->write = 0;
+ bpage->entries = 0;
+
+ local_set(&bpage->page->commit, 0);
+}
+
+/* Initialize @bpage to wrap the backing @page and start empty. */
+static void simple_bpage_init(struct simple_buffer_page *bpage, void *page)
+{
+ INIT_LIST_HEAD(&bpage->link);
+ bpage->page = (struct buffer_data_page *)page;
+
+ simple_bpage_reset(bpage);
+}
+
+/*
+ * Single-writer meta-page counter update: plain read plus WRITE_ONCE is
+ * sufficient because only the writer side updates these fields.
+ * Arguments are parenthesized so expressions expand safely.
+ */
+#define simple_rb_meta_inc(__meta, __inc) \
+ WRITE_ONCE((__meta), (__meta) + (__inc))
+
+/* A cpu_buffer is "loaded" once init has published its bpages array. */
+static bool simple_rb_loaded(struct simple_rb_per_cpu *cpu_buffer)
+{
+ return !!cpu_buffer->bpages;
+}
+
+/*
+ * Walk the ring from the cached head page until the page whose
+ * predecessor's link carries the HEAD flag is found. Bounded to two full
+ * laps around the ring. On success updates cpu_buffer->head_page and
+ * returns 0; returns -EBUSY if no head was found within the bound.
+ */
+static int simple_rb_find_head(struct simple_rb_per_cpu *cpu_buffer)
+{
+ int retry = cpu_buffer->nr_pages * 2;
+ struct simple_buffer_page *head;
+
+ head = cpu_buffer->head_page;
+
+ while (retry--) {
+ unsigned long link;
+
+spin:
+ /* See smp_store_release in simple_bpage_set_head_link() */
+ link = (unsigned long)smp_load_acquire(&head->link.prev->next);
+
+ switch (link & ~SIMPLE_RB_LINK_MASK) {
+ /* Found the head */
+ case SIMPLE_RB_LINK_HEAD:
+ cpu_buffer->head_page = head;
+ return 0;
+ /* The writer caught the head, we can spin, that won't be long */
+ /* NOTE(review): consider cpu_relax() in this spin - confirm */
+ case SIMPLE_RB_LINK_HEAD_MOVING:
+ goto spin;
+ }
+
+ head = simple_bpage_next_page(head);
+ }
+
+ return -EBUSY;
+}
+
+/**
+ * simple_ring_buffer_swap_reader_page - Swap ring-buffer head with the reader
+ * @cpu_buffer: A simple_rb_per_cpu
+ *
+ * This function enables consuming reading. It ensures the current head page will not be overwritten
+ * and can be safely read.
+ *
+ * Returns 0 on success, -ENODEV if @cpu_buffer was unloaded or -EBUSY if we failed to catch the
+ * head page.
+ */
+int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer)
+{
+ struct simple_buffer_page *last, *head, *reader;
+ unsigned long overrun;
+ int retry = 8;
+ int ret;
+
+ if (!simple_rb_loaded(cpu_buffer))
+ return -ENODEV;
+
+ reader = cpu_buffer->reader_page;
+
+ do {
+ /* Run after the writer to find the head */
+ ret = simple_rb_find_head(cpu_buffer);
+ if (ret)
+ return ret;
+
+ head = cpu_buffer->head_page;
+
+ /* Connect the reader page around the header page */
+ reader->link.next = head->link.next;
+ reader->link.prev = head->link.prev;
+
+ /* The last page before the head */
+ last = simple_bpage_from_link(head->link.prev);
+
+ /* The reader page points to the new header page */
+ simple_bpage_set_head_link(reader);
+
+ overrun = cpu_buffer->meta->overrun;
+ } while (!simple_bpage_unset_head_link(last, reader, SIMPLE_RB_LINK_NORMAL) && retry--);
+
+ /*
+ * retry is negative only when every attempt failed: a success
+ * short-circuits before the decrement, so retry can legitimately be
+ * 0 after a successful last attempt. The previous "!retry" check
+ * both missed the exhausted case (retry == -1) and rejected a
+ * successful swap on the final try.
+ */
+ if (retry < 0)
+ return -EINVAL;
+
+ cpu_buffer->head_page = simple_bpage_from_link(reader->link.next);
+ cpu_buffer->head_page->link.prev = &reader->link;
+ cpu_buffer->reader_page = head;
+ cpu_buffer->meta->reader.lost_events = overrun - cpu_buffer->last_overrun;
+ cpu_buffer->meta->reader.id = cpu_buffer->reader_page->id;
+ cpu_buffer->last_overrun = overrun;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_swap_reader_page);
+
+/*
+ * Advance the writer to the next page. If that page is the head (the
+ * writer caught up with the reader side), claim it via HEAD_MOVING,
+ * account the entries about to be overwritten as overrun, then publish
+ * the page after it as the new head. Returns the new tail page, reset
+ * and ready for writing.
+ */
+static struct simple_buffer_page *simple_rb_move_tail(struct simple_rb_per_cpu *cpu_buffer)
+{
+ struct simple_buffer_page *tail, *new_tail;
+
+ tail = cpu_buffer->tail_page;
+ new_tail = simple_bpage_next_page(tail);
+
+ if (simple_bpage_unset_head_link(tail, new_tail, SIMPLE_RB_LINK_HEAD_MOVING)) {
+ /*
+ * Oh no! we've caught the head. There is none anymore and
+ * swap_reader will spin until we set the new one. Overrun must
+ * be written first, to make sure we report the correct number
+ * of lost events.
+ */
+ simple_rb_meta_inc(cpu_buffer->meta->overrun, new_tail->entries);
+ simple_rb_meta_inc(cpu_buffer->meta->pages_lost, 1);
+
+ simple_bpage_set_head_link(new_tail);
+ simple_bpage_set_normal_link(tail);
+ }
+
+ simple_bpage_reset(new_tail);
+ cpu_buffer->tail_page = new_tail;
+
+ simple_rb_meta_inc(cpu_buffer->meta->pages_touched, 1);
+
+ return new_tail;
+}
+
+/*
+ * Total on-page size of an event: payload @length plus the event header
+ * plus one array slot used to store the payload length. The local
+ * pointer is only an operand of sizeof and is never evaluated.
+ */
+static unsigned long rb_event_size(unsigned long length)
+{
+ struct ring_buffer_event *event;
+
+ return length + RB_EVNT_HDR_SIZE + sizeof(event->array[0]);
+}
+
+/*
+ * Emit a TIME_EXTEND event carrying the delta that does not fit in the
+ * 27-bit time_delta field, and return the location right after it
+ * (the extend event itself occupies 8 bytes).
+ */
+static struct ring_buffer_event *
+rb_event_add_ts_extend(struct ring_buffer_event *event, u64 delta)
+{
+ event->type_len = RINGBUF_TYPE_TIME_EXTEND;
+ event->time_delta = delta & TS_MASK;
+ event->array[0] = delta >> TS_SHIFT;
+
+ return (struct ring_buffer_event *)((unsigned long)event + 8);
+}
+
+/*
+ * Reserve space for one event of payload size @length at @timestamp.
+ * Moves the tail to a new page when the event (plus a possible
+ * TIME_EXTEND) would overflow the current one; the first event on a
+ * page gets an absolute timestamp in the page header instead of a delta.
+ * NOTE(review): assumes a single event always fits in an empty page,
+ * i.e. @length is bounded by the caller - confirm.
+ */
+static struct ring_buffer_event *
+simple_rb_reserve_next(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, u64 timestamp)
+{
+ unsigned long ts_ext_size = 0, event_size = rb_event_size(length);
+ struct simple_buffer_page *tail = cpu_buffer->tail_page;
+ struct ring_buffer_event *event;
+ u32 write, prev_write;
+ u64 time_delta;
+
+ time_delta = timestamp - cpu_buffer->write_stamp;
+
+ /* Delta too large for the 27-bit field: need a TIME_EXTEND event */
+ if (test_time_stamp(time_delta))
+ ts_ext_size = 8;
+
+ prev_write = tail->write;
+ write = prev_write + event_size + ts_ext_size;
+
+ if (unlikely(write > (PAGE_SIZE - BUF_PAGE_HDR_SIZE)))
+ tail = simple_rb_move_tail(cpu_buffer);
+
+ /* First event on the page: absolute stamp in the header, no delta */
+ if (!tail->entries) {
+ tail->page->time_stamp = timestamp;
+ time_delta = 0;
+ ts_ext_size = 0;
+ write = event_size;
+ prev_write = 0;
+ }
+
+ tail->write = write;
+ tail->entries++;
+
+ cpu_buffer->write_stamp = timestamp;
+
+ event = (struct ring_buffer_event *)(tail->page->data + prev_write);
+ if (ts_ext_size) {
+ event = rb_event_add_ts_extend(event, time_delta);
+ time_delta = 0;
+ }
+
+ /* type_len 0: payload length lives in array[0] */
+ event->type_len = 0;
+ event->time_delta = time_delta;
+ event->array[0] = event_size - RB_EVNT_HDR_SIZE;
+
+ return event;
+}
+
+/**
+ * simple_ring_buffer_reserve - Reserve an entry in @cpu_buffer
+ * @cpu_buffer: A simple_rb_per_cpu
+ * @length: Size of the entry in bytes
+ * @timestamp: Timestamp of the entry
+ *
+ * Returns the address of the entry where to write data or NULL
+ */
+void *simple_ring_buffer_reserve(struct simple_rb_per_cpu *cpu_buffer, unsigned long length,
+ u64 timestamp)
+{
+ struct ring_buffer_event *rb_event;
+
+ /*
+ * READY -> WRITING guards against concurrent writers and against
+ * reserving while tracing is disabled (UNAVAILABLE). Released by
+ * simple_ring_buffer_commit().
+ */
+ if (cmpxchg(&cpu_buffer->status, SIMPLE_RB_READY, SIMPLE_RB_WRITING) != SIMPLE_RB_READY)
+ return NULL;
+
+ rb_event = simple_rb_reserve_next(cpu_buffer, length, timestamp);
+
+ /* Payload starts after the header and the array[0] length word */
+ return &rb_event->array[1];
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_reserve);
+
+/**
+ * simple_ring_buffer_commit - Commit the entry reserved with simple_ring_buffer_reserve()
+ * @cpu_buffer: The simple_rb_per_cpu where the entry has been reserved
+ */
+void simple_ring_buffer_commit(struct simple_rb_per_cpu *cpu_buffer)
+{
+ /* Publish the new committed length of the tail page */
+ local_set(&cpu_buffer->tail_page->page->commit,
+ cpu_buffer->tail_page->write);
+ simple_rb_meta_inc(cpu_buffer->meta->entries, 1);
+
+ /*
+ * Paired with simple_rb_enable_tracing() to ensure data is
+ * written to the ring-buffer before teardown.
+ */
+ smp_store_release(&cpu_buffer->status, SIMPLE_RB_READY);
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_commit);
+
+/*
+ * Flip the buffer status between READY and UNAVAILABLE, returning the
+ * previous status. Disabling spins while a writer holds WRITING; the
+ * acquire pairs with the release in simple_ring_buffer_commit().
+ */
+static u32 simple_rb_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable)
+{
+ u32 prev_status;
+
+ if (enable)
+ return cmpxchg(&cpu_buffer->status, SIMPLE_RB_UNAVAILABLE, SIMPLE_RB_READY);
+
+ /* Wait for the buffer to be released */
+ do {
+ prev_status = cmpxchg_acquire(&cpu_buffer->status,
+ SIMPLE_RB_READY,
+ SIMPLE_RB_UNAVAILABLE);
+ } while (prev_status == SIMPLE_RB_WRITING);
+
+ return prev_status;
+}
+
+/**
+ * simple_ring_buffer_reset - Reset @cpu_buffer
+ * @cpu_buffer: A simple_rb_per_cpu
+ *
+ * This will not clear the content of the data, only reset counters and pointers
+ *
+ * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded.
+ */
+int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer)
+{
+ struct simple_buffer_page *bpage;
+ u32 prev_status;
+ int ret;
+
+ if (!simple_rb_loaded(cpu_buffer))
+ return -ENODEV;
+
+ /* Quiesce the writer while we rewind every page */
+ prev_status = simple_rb_enable_tracing(cpu_buffer, false);
+
+ /*
+ * NOTE(review): on failure here tracing stays disabled even if it
+ * was previously enabled - confirm this is intended.
+ */
+ ret = simple_rb_find_head(cpu_buffer);
+ if (ret)
+ return ret;
+
+ /* Rewind all ring pages, starting from the head */
+ bpage = cpu_buffer->tail_page = cpu_buffer->head_page;
+ do {
+ simple_bpage_reset(bpage);
+ bpage = simple_bpage_next_page(bpage);
+ } while (bpage != cpu_buffer->head_page);
+
+ simple_bpage_reset(cpu_buffer->reader_page);
+
+ cpu_buffer->last_overrun = 0;
+ cpu_buffer->write_stamp = 0;
+
+ cpu_buffer->meta->reader.read = 0;
+ cpu_buffer->meta->reader.lost_events = 0;
+ cpu_buffer->meta->entries = 0;
+ cpu_buffer->meta->overrun = 0;
+ cpu_buffer->meta->read = 0;
+ cpu_buffer->meta->pages_lost = 0;
+ cpu_buffer->meta->pages_touched = 0;
+
+ /* Restore tracing only if it was enabled before the reset */
+ if (prev_status == SIMPLE_RB_READY)
+ simple_rb_enable_tracing(cpu_buffer, true);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_reset);
+
+/*
+ * Initialize @cpu_buffer from @desc, mapping the meta page and every data
+ * page through @load_page. bpages[0] becomes the reader page; the
+ * remaining pages are linked into a closed ring with the page after the
+ * reader as initial head/tail. Returns 0 or -EINVAL, unloading anything
+ * already loaded on failure.
+ */
+int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer,
+ struct simple_buffer_page *bpages,
+ const struct ring_buffer_desc *desc,
+ void *(*load_page)(unsigned long va),
+ void (*unload_page)(void *va))
+{
+ struct simple_buffer_page *bpage = bpages;
+ int ret = 0;
+ void *page;
+ int i;
+
+ /* At least 1 reader page and two pages in the ring-buffer */
+ if (desc->nr_page_va < 3)
+ return -EINVAL;
+
+ memset(cpu_buffer, 0, sizeof(*cpu_buffer));
+
+ cpu_buffer->meta = load_page(desc->meta_va);
+ if (!cpu_buffer->meta)
+ return -EINVAL;
+
+ memset(cpu_buffer->meta, 0, sizeof(*cpu_buffer->meta));
+ cpu_buffer->meta->meta_page_size = PAGE_SIZE;
+
+ /* The reader page is not part of the ring initially */
+ page = load_page(desc->page_va[0]);
+ if (!page) {
+ unload_page(cpu_buffer->meta);
+ return -EINVAL;
+ }
+
+ simple_bpage_init(bpage, page);
+ bpage->id = 0;
+
+ cpu_buffer->nr_pages = 1;
+
+ cpu_buffer->reader_page = bpage;
+ cpu_buffer->tail_page = bpage + 1;
+ cpu_buffer->head_page = bpage + 1;
+
+ for (i = 1; i < desc->nr_page_va; i++) {
+ page = load_page(desc->page_va[i]);
+ if (!page) {
+ ret = -EINVAL;
+ break;
+ }
+
+ simple_bpage_init(++bpage, page);
+
+ bpage->link.next = &(bpage + 1)->link;
+ bpage->link.prev = &(bpage - 1)->link;
+ bpage->id = i;
+
+ cpu_buffer->nr_pages = i + 1;
+ }
+
+ if (ret) {
+ /*
+ * Unload the pointers returned by load_page(), matching how
+ * the meta page is unloaded; the raw desc VAs are only
+ * guaranteed to equal them for the identity callbacks.
+ */
+ for (i--; i >= 0; i--)
+ unload_page(bpages[i].page);
+ unload_page(cpu_buffer->meta);
+
+ return ret;
+ }
+
+ /* Close the ring */
+ bpage->link.next = &cpu_buffer->tail_page->link;
+ cpu_buffer->tail_page->link.prev = &bpage->link;
+
+ /* The last init'ed page points to the head page */
+ simple_bpage_set_head_link(bpage);
+
+ /*
+ * Publish the page count only now that it is final: assigning it
+ * right after the memset above would always store 0.
+ */
+ cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages;
+
+ cpu_buffer->bpages = bpages;
+
+ return 0;
+}
+
+/* Identity callbacks for buffers already mapped in the current address space */
+static void *__load_page(unsigned long page)
+{
+ return (void *)page;
+}
+
+static void __unload_page(void *page) { }
+
+/**
+ * simple_ring_buffer_init - Init @cpu_buffer based on @desc
+ * @cpu_buffer: A simple_rb_per_cpu buffer to init, allocated by the caller.
+ * @bpages: Array of simple_buffer_pages, with as many elements as @desc->nr_page_va
+ * @desc: A ring_buffer_desc
+ *
+ * @bpages is stored in @cpu_buffer and must stay valid until
+ * simple_ring_buffer_unload().
+ *
+ * Returns 0 on success or -EINVAL if the content of @desc is invalid
+ */
+int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages,
+ const struct ring_buffer_desc *desc)
+{
+ return simple_ring_buffer_init_mm(cpu_buffer, bpages, desc, __load_page, __unload_page);
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_init);
+
+/*
+ * Disable tracing, unload the meta page and every data page (including
+ * the reader page, which is bpages[0]) through @unload_page, then mark
+ * the buffer unloaded. No-op if the buffer was never loaded.
+ */
+void simple_ring_buffer_unload_mm(struct simple_rb_per_cpu *cpu_buffer,
+ void (*unload_page)(void *))
+{
+ int p;
+
+ if (!simple_rb_loaded(cpu_buffer))
+ return;
+
+ simple_rb_enable_tracing(cpu_buffer, false);
+
+ unload_page(cpu_buffer->meta);
+ for (p = 0; p < cpu_buffer->nr_pages; p++)
+ unload_page(cpu_buffer->bpages[p].page);
+
+ cpu_buffer->bpages = NULL;
+}
+
+/**
+ * simple_ring_buffer_unload - Prepare @cpu_buffer for deletion
+ * @cpu_buffer: A simple_rb_per_cpu that will be deleted.
+ */
+void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer)
+{
+ /* No "return" of a void expression: that is a GNU extension only */
+ simple_ring_buffer_unload_mm(cpu_buffer, __unload_page);
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_unload);
+
+/**
+ * simple_ring_buffer_enable_tracing - Enable or disable writing to @cpu_buffer
+ * @cpu_buffer: A simple_rb_per_cpu
+ * @enable: True to enable tracing, False to disable it
+ *
+ * When disabling, waits for any in-flight writer to commit first.
+ *
+ * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded
+ */
+int simple_ring_buffer_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable)
+{
+ if (!simple_rb_loaded(cpu_buffer))
+ return -ENODEV;
+
+ simple_rb_enable_tracing(cpu_buffer, enable);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_enable_tracing);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 23de3719f495..6eb4d3097a4d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -47,7 +47,6 @@
#include <linux/trace.h>
#include <linux/sched/clock.h>
#include <linux/sched/rt.h>
-#include <linux/fsnotify.h>
#include <linux/irq_work.h>
#include <linux/workqueue.h>
#include <linux/sort.h>
@@ -219,14 +218,36 @@ static void ftrace_trace_userstack(struct trace_array *tr,
static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
static char *default_bootup_tracer;
-static bool allocate_snapshot;
-static bool snapshot_at_boot;
-
static char boot_instance_info[COMMAND_LINE_SIZE] __initdata;
static int boot_instance_index;
-static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata;
-static int boot_snapshot_index;
+/*
+ * Repeated boot parameters, including Bootconfig array expansions, need
+ * to stay in the delimiter form that the existing parser consumes.
+ */
+/*
+ * Append @str to the NUL-terminated accumulator @buf (capacity @size),
+ * inserting @sep before it when @buf is non-empty. Silently drops @str
+ * if it would not fit; empty @str is a no-op.
+ */
+void __init trace_append_boot_param(char *buf, const char *str, char sep,
+ int size)
+{
+ int len, needed, str_len;
+
+ if (!*str)
+ return;
+
+ len = strlen(buf);
+ str_len = strlen(str);
+ needed = len + str_len + 1;
+
+ /* For continuation, account for the separator. */
+ if (len)
+ needed++;
+ if (needed > size)
+ return;
+
+ if (len)
+ buf[len++] = sep;
+
+ strscpy(buf + len, str, size - len);
+}
static int __init set_cmdline_ftrace(char *str)
{
@@ -276,38 +297,6 @@ static int __init stop_trace_on_warning(char *str)
}
__setup("traceoff_on_warning", stop_trace_on_warning);
-static int __init boot_alloc_snapshot(char *str)
-{
- char *slot = boot_snapshot_info + boot_snapshot_index;
- int left = sizeof(boot_snapshot_info) - boot_snapshot_index;
- int ret;
-
- if (str[0] == '=') {
- str++;
- if (strlen(str) >= left)
- return -1;
-
- ret = snprintf(slot, left, "%s\t", str);
- boot_snapshot_index += ret;
- } else {
- allocate_snapshot = true;
- /* We also need the main ring buffer expanded */
- trace_set_ring_buffer_expanded(NULL);
- }
- return 1;
-}
-__setup("alloc_snapshot", boot_alloc_snapshot);
-
-
-static int __init boot_snapshot(char *str)
-{
- snapshot_at_boot = true;
- boot_alloc_snapshot(str);
- return 1;
-}
-__setup("ftrace_boot_snapshot", boot_snapshot);
-
-
static int __init boot_instance(char *str)
{
char *slot = boot_instance_info + boot_instance_index;
@@ -329,7 +318,8 @@ static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
static int __init set_trace_boot_options(char *str)
{
- strscpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
+ trace_append_boot_param(trace_boot_options_buf, str, ',',
+ MAX_TRACER_SIZE);
return 1;
}
__setup("trace_options=", set_trace_boot_options);
@@ -555,7 +545,7 @@ static bool update_marker_trace(struct trace_array *tr, int enabled)
lockdep_assert_held(&event_mutex);
if (enabled) {
- if (!list_empty(&tr->marker_list))
+ if (tr->trace_flags & TRACE_ITER(COPY_MARKER))
return false;
list_add_rcu(&tr->marker_list, &marker_copies);
@@ -563,10 +553,10 @@ static bool update_marker_trace(struct trace_array *tr, int enabled)
return true;
}
- if (list_empty(&tr->marker_list))
+ if (!(tr->trace_flags & TRACE_ITER(COPY_MARKER)))
return false;
- list_del_init(&tr->marker_list);
+ list_del_rcu(&tr->marker_list);
tr->trace_flags &= ~TRACE_ITER(COPY_MARKER);
return true;
}
@@ -578,8 +568,59 @@ void trace_set_ring_buffer_expanded(struct trace_array *tr)
tr->ring_buffer_expanded = true;
}
+/* Workqueue callback: destroy the trace_array scheduled for autoremoval. */
+static void trace_array_autoremove(struct work_struct *work)
+{
+ struct trace_array *tr = container_of(work, struct trace_array, autoremove_work);
+
+ trace_array_destroy(tr);
+}
+
+static struct workqueue_struct *autoremove_wq;
+
+/* Queue @tr for destruction; silently does nothing if the wq never came up. */
+static void trace_array_kick_autoremove(struct trace_array *tr)
+{
+ if (autoremove_wq)
+ queue_work(autoremove_wq, &tr->autoremove_work);
+}
+
+static void trace_array_cancel_autoremove(struct trace_array *tr)
+{
+ /*
+ * This may be called from within trace_array_autoremove() itself,
+ * so only cancel while the work is still pending: cancel_work_sync()
+ * on the currently-running work would deadlock the workqueue.
+ */
+ if (work_pending(&tr->autoremove_work))
+ cancel_work_sync(&tr->autoremove_work);
+}
+
+/* Prepare @tr's autoremove work item; queued later by kick_autoremove(). */
+static void trace_array_init_autoremove(struct trace_array *tr)
+{
+ INIT_WORK(&tr->autoremove_work, trace_array_autoremove);
+}
+
+/*
+ * Lazily allocate the shared autoremove workqueue. Idempotent; on
+ * allocation failure autoremoval is simply disabled (with a warning).
+ */
+static void trace_array_start_autoremove(void)
+{
+ if (autoremove_wq)
+ return;
+
+ autoremove_wq = alloc_workqueue("tr_autoremove_wq",
+ WQ_UNBOUND | WQ_HIGHPRI, 0);
+ if (!autoremove_wq)
+ pr_warn("Unable to allocate tr_autoremove_wq. autoremove disabled.\n");
+}
+
LIST_HEAD(ftrace_trace_arrays);
+/*
+ * Take a reference on @this_tr. Fails with -ENODEV once the array is
+ * marked free_on_close and autoremoval is operational, so no new user
+ * can resurrect an array that is about to be destroyed.
+ * Caller holds trace_types_lock.
+ */
+static int __trace_array_get(struct trace_array *this_tr)
+{
+ /* When free_on_close is set, this is not available anymore. */
+ if (autoremove_wq && this_tr->free_on_close)
+ return -ENODEV;
+
+ this_tr->ref++;
+ return 0;
+}
+
int trace_array_get(struct trace_array *this_tr)
{
struct trace_array *tr;
@@ -587,8 +628,7 @@ int trace_array_get(struct trace_array *this_tr)
guard(mutex)(&trace_types_lock);
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
if (tr == this_tr) {
- tr->ref++;
- return 0;
+ return __trace_array_get(tr);
}
}
@@ -599,6 +639,12 @@ static void __trace_array_put(struct trace_array *this_tr)
{
WARN_ON(!this_tr->ref);
this_tr->ref--;
+ /*
+ * When free_on_close is set, prepare removing the array
+ * when the last reference is released.
+ */
+ if (this_tr->ref == 1 && this_tr->free_on_close)
+ trace_array_kick_autoremove(this_tr);
}
/**
@@ -807,47 +853,6 @@ void tracing_on(void)
EXPORT_SYMBOL_GPL(tracing_on);
#ifdef CONFIG_TRACER_SNAPSHOT
-static void tracing_snapshot_instance_cond(struct trace_array *tr,
- void *cond_data)
-{
- unsigned long flags;
-
- if (in_nmi()) {
- trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
- trace_array_puts(tr, "*** snapshot is being ignored ***\n");
- return;
- }
-
- if (!tr->allocated_snapshot) {
- trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n");
- trace_array_puts(tr, "*** stopping trace here! ***\n");
- tracer_tracing_off(tr);
- return;
- }
-
- if (tr->mapped) {
- trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n");
- trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
- return;
- }
-
- /* Note, snapshot can not be used when the tracer uses it */
- if (tracer_uses_snapshot(tr->current_trace)) {
- trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n");
- trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
- return;
- }
-
- local_irq_save(flags);
- update_max_tr(tr, current, smp_processor_id(), cond_data);
- local_irq_restore(flags);
-}
-
-void tracing_snapshot_instance(struct trace_array *tr)
-{
- tracing_snapshot_instance_cond(tr, NULL);
-}
-
/**
* tracing_snapshot - take a snapshot of the current buffer.
*
@@ -871,138 +876,6 @@ void tracing_snapshot(void)
EXPORT_SYMBOL_GPL(tracing_snapshot);
/**
- * tracing_snapshot_cond - conditionally take a snapshot of the current buffer.
- * @tr: The tracing instance to snapshot
- * @cond_data: The data to be tested conditionally, and possibly saved
- *
- * This is the same as tracing_snapshot() except that the snapshot is
- * conditional - the snapshot will only happen if the
- * cond_snapshot.update() implementation receiving the cond_data
- * returns true, which means that the trace array's cond_snapshot
- * update() operation used the cond_data to determine whether the
- * snapshot should be taken, and if it was, presumably saved it along
- * with the snapshot.
- */
-void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
-{
- tracing_snapshot_instance_cond(tr, cond_data);
-}
-EXPORT_SYMBOL_GPL(tracing_snapshot_cond);
-
-/**
- * tracing_cond_snapshot_data - get the user data associated with a snapshot
- * @tr: The tracing instance
- *
- * When the user enables a conditional snapshot using
- * tracing_snapshot_cond_enable(), the user-defined cond_data is saved
- * with the snapshot. This accessor is used to retrieve it.
- *
- * Should not be called from cond_snapshot.update(), since it takes
- * the tr->max_lock lock, which the code calling
- * cond_snapshot.update() has already done.
- *
- * Returns the cond_data associated with the trace array's snapshot.
- */
-void *tracing_cond_snapshot_data(struct trace_array *tr)
-{
- void *cond_data = NULL;
-
- local_irq_disable();
- arch_spin_lock(&tr->max_lock);
-
- if (tr->cond_snapshot)
- cond_data = tr->cond_snapshot->cond_data;
-
- arch_spin_unlock(&tr->max_lock);
- local_irq_enable();
-
- return cond_data;
-}
-EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data);
-
-static int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
- struct array_buffer *size_buf, int cpu_id);
-static void set_buffer_entries(struct array_buffer *buf, unsigned long val);
-
-int tracing_alloc_snapshot_instance(struct trace_array *tr)
-{
- int order;
- int ret;
-
- if (!tr->allocated_snapshot) {
-
- /* Make the snapshot buffer have the same order as main buffer */
- order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
- ret = ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, order);
- if (ret < 0)
- return ret;
-
- /* allocate spare buffer */
- ret = resize_buffer_duplicate_size(&tr->snapshot_buffer,
- &tr->array_buffer, RING_BUFFER_ALL_CPUS);
- if (ret < 0)
- return ret;
-
- tr->allocated_snapshot = true;
- }
-
- return 0;
-}
-
-static void free_snapshot(struct trace_array *tr)
-{
- /*
- * We don't free the ring buffer. instead, resize it because
- * The max_tr ring buffer has some state (e.g. ring->clock) and
- * we want preserve it.
- */
- ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, 0);
- ring_buffer_resize(tr->snapshot_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
- set_buffer_entries(&tr->snapshot_buffer, 1);
- tracing_reset_online_cpus(&tr->snapshot_buffer);
- tr->allocated_snapshot = false;
-}
-
-static int tracing_arm_snapshot_locked(struct trace_array *tr)
-{
- int ret;
-
- lockdep_assert_held(&trace_types_lock);
-
- spin_lock(&tr->snapshot_trigger_lock);
- if (tr->snapshot == UINT_MAX || tr->mapped) {
- spin_unlock(&tr->snapshot_trigger_lock);
- return -EBUSY;
- }
-
- tr->snapshot++;
- spin_unlock(&tr->snapshot_trigger_lock);
-
- ret = tracing_alloc_snapshot_instance(tr);
- if (ret) {
- spin_lock(&tr->snapshot_trigger_lock);
- tr->snapshot--;
- spin_unlock(&tr->snapshot_trigger_lock);
- }
-
- return ret;
-}
-
-int tracing_arm_snapshot(struct trace_array *tr)
-{
- guard(mutex)(&trace_types_lock);
- return tracing_arm_snapshot_locked(tr);
-}
-
-void tracing_disarm_snapshot(struct trace_array *tr)
-{
- spin_lock(&tr->snapshot_trigger_lock);
- if (!WARN_ON(!tr->snapshot))
- tr->snapshot--;
- spin_unlock(&tr->snapshot_trigger_lock);
-}
-
-/**
* tracing_alloc_snapshot - allocate snapshot buffer.
*
* This only allocates the snapshot buffer if it isn't already
@@ -1022,159 +895,18 @@ int tracing_alloc_snapshot(void)
return ret;
}
-EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
-
-/**
- * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer.
- *
- * This is similar to tracing_snapshot(), but it will allocate the
- * snapshot buffer if it isn't already allocated. Use this only
- * where it is safe to sleep, as the allocation may sleep.
- *
- * This causes a swap between the snapshot buffer and the current live
- * tracing buffer. You can use this to take snapshots of the live
- * trace when some condition is triggered, but continue to trace.
- */
-void tracing_snapshot_alloc(void)
-{
- int ret;
-
- ret = tracing_alloc_snapshot();
- if (ret < 0)
- return;
-
- tracing_snapshot();
-}
-EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
-
-/**
- * tracing_snapshot_cond_enable - enable conditional snapshot for an instance
- * @tr: The tracing instance
- * @cond_data: User data to associate with the snapshot
- * @update: Implementation of the cond_snapshot update function
- *
- * Check whether the conditional snapshot for the given instance has
- * already been enabled, or if the current tracer is already using a
- * snapshot; if so, return -EBUSY, else create a cond_snapshot and
- * save the cond_data and update function inside.
- *
- * Returns 0 if successful, error otherwise.
- */
-int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
- cond_update_fn_t update)
-{
- struct cond_snapshot *cond_snapshot __free(kfree) =
- kzalloc_obj(*cond_snapshot);
- int ret;
-
- if (!cond_snapshot)
- return -ENOMEM;
-
- cond_snapshot->cond_data = cond_data;
- cond_snapshot->update = update;
-
- guard(mutex)(&trace_types_lock);
-
- if (tracer_uses_snapshot(tr->current_trace))
- return -EBUSY;
-
- /*
- * The cond_snapshot can only change to NULL without the
- * trace_types_lock. We don't care if we race with it going
- * to NULL, but we want to make sure that it's not set to
- * something other than NULL when we get here, which we can
- * do safely with only holding the trace_types_lock and not
- * having to take the max_lock.
- */
- if (tr->cond_snapshot)
- return -EBUSY;
-
- ret = tracing_arm_snapshot_locked(tr);
- if (ret)
- return ret;
-
- local_irq_disable();
- arch_spin_lock(&tr->max_lock);
- tr->cond_snapshot = no_free_ptr(cond_snapshot);
- arch_spin_unlock(&tr->max_lock);
- local_irq_enable();
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable);
-
-/**
- * tracing_snapshot_cond_disable - disable conditional snapshot for an instance
- * @tr: The tracing instance
- *
- * Check whether the conditional snapshot for the given instance is
- * enabled; if so, free the cond_snapshot associated with it,
- * otherwise return -EINVAL.
- *
- * Returns 0 if successful, error otherwise.
- */
-int tracing_snapshot_cond_disable(struct trace_array *tr)
-{
- int ret = 0;
-
- local_irq_disable();
- arch_spin_lock(&tr->max_lock);
-
- if (!tr->cond_snapshot)
- ret = -EINVAL;
- else {
- kfree(tr->cond_snapshot);
- tr->cond_snapshot = NULL;
- }
-
- arch_spin_unlock(&tr->max_lock);
- local_irq_enable();
-
- tracing_disarm_snapshot(tr);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
#else
void tracing_snapshot(void)
{
WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used");
}
EXPORT_SYMBOL_GPL(tracing_snapshot);
-void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
-{
- WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used");
-}
-EXPORT_SYMBOL_GPL(tracing_snapshot_cond);
-int tracing_alloc_snapshot(void)
-{
- WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used");
- return -ENODEV;
-}
-EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
void tracing_snapshot_alloc(void)
{
/* Give warning */
tracing_snapshot();
}
EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
-void *tracing_cond_snapshot_data(struct trace_array *tr)
-{
- return NULL;
-}
-EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data);
-int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update)
-{
- return -ENODEV;
-}
-EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable);
-int tracing_snapshot_cond_disable(struct trace_array *tr)
-{
- return false;
-}
-EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
-#define free_snapshot(tr) do { } while (0)
-#define tracing_arm_snapshot_locked(tr) ({ -EBUSY; })
#endif /* CONFIG_TRACER_SNAPSHOT */
void tracer_tracing_off(struct trace_array *tr)
@@ -1487,206 +1219,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
unsigned long __read_mostly tracing_thresh;
-#ifdef CONFIG_TRACER_MAX_TRACE
-#ifdef LATENCY_FS_NOTIFY
-static struct workqueue_struct *fsnotify_wq;
-
-static void latency_fsnotify_workfn(struct work_struct *work)
-{
- struct trace_array *tr = container_of(work, struct trace_array,
- fsnotify_work);
- fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY);
-}
-
-static void latency_fsnotify_workfn_irq(struct irq_work *iwork)
-{
- struct trace_array *tr = container_of(iwork, struct trace_array,
- fsnotify_irqwork);
- queue_work(fsnotify_wq, &tr->fsnotify_work);
-}
-
-__init static int latency_fsnotify_init(void)
-{
- fsnotify_wq = alloc_workqueue("tr_max_lat_wq",
- WQ_UNBOUND | WQ_HIGHPRI, 0);
- if (!fsnotify_wq) {
- pr_err("Unable to allocate tr_max_lat_wq\n");
- return -ENOMEM;
- }
- return 0;
-}
-
-late_initcall_sync(latency_fsnotify_init);
-
-void latency_fsnotify(struct trace_array *tr)
-{
- if (!fsnotify_wq)
- return;
- /*
- * We cannot call queue_work(&tr->fsnotify_work) from here because it's
- * possible that we are called from __schedule() or do_idle(), which
- * could cause a deadlock.
- */
- irq_work_queue(&tr->fsnotify_irqwork);
-}
-#endif /* !LATENCY_FS_NOTIFY */
-
-static const struct file_operations tracing_max_lat_fops;
-
-static void trace_create_maxlat_file(struct trace_array *tr,
- struct dentry *d_tracer)
-{
-#ifdef LATENCY_FS_NOTIFY
- INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn);
- init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq);
-#endif
- tr->d_max_latency = trace_create_file("tracing_max_latency",
- TRACE_MODE_WRITE,
- d_tracer, tr,
- &tracing_max_lat_fops);
-}
-
-/*
- * Copy the new maximum trace into the separate maximum-trace
- * structure. (this way the maximum trace is permanently saved,
- * for later retrieval via /sys/kernel/tracing/tracing_max_latency)
- */
-static void
-__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
-{
- struct array_buffer *trace_buf = &tr->array_buffer;
- struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu);
- struct array_buffer *max_buf = &tr->snapshot_buffer;
- struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu);
-
- max_buf->cpu = cpu;
- max_buf->time_start = data->preempt_timestamp;
-
- max_data->saved_latency = tr->max_latency;
- max_data->critical_start = data->critical_start;
- max_data->critical_end = data->critical_end;
-
- strscpy(max_data->comm, tsk->comm);
- max_data->pid = tsk->pid;
- /*
- * If tsk == current, then use current_uid(), as that does not use
- * RCU. The irq tracer can be called out of RCU scope.
- */
- if (tsk == current)
- max_data->uid = current_uid();
- else
- max_data->uid = task_uid(tsk);
-
- max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
- max_data->policy = tsk->policy;
- max_data->rt_priority = tsk->rt_priority;
-
- /* record this tasks comm */
- tracing_record_cmdline(tsk);
- latency_fsnotify(tr);
-}
-#else
-static inline void trace_create_maxlat_file(struct trace_array *tr,
- struct dentry *d_tracer) { }
-static inline void __update_max_tr(struct trace_array *tr,
- struct task_struct *tsk, int cpu) { }
-#endif /* CONFIG_TRACER_MAX_TRACE */
-
-#ifdef CONFIG_TRACER_SNAPSHOT
-/**
- * update_max_tr - snapshot all trace buffers from global_trace to max_tr
- * @tr: tracer
- * @tsk: the task with the latency
- * @cpu: The cpu that initiated the trace.
- * @cond_data: User data associated with a conditional snapshot
- *
- * Flip the buffers between the @tr and the max_tr and record information
- * about which task was the cause of this latency.
- */
-void
-update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
- void *cond_data)
-{
- if (tr->stop_count)
- return;
-
- WARN_ON_ONCE(!irqs_disabled());
-
- if (!tr->allocated_snapshot) {
- /* Only the nop tracer should hit this when disabling */
- WARN_ON_ONCE(tr->current_trace != &nop_trace);
- return;
- }
-
- arch_spin_lock(&tr->max_lock);
-
- /* Inherit the recordable setting from array_buffer */
- if (ring_buffer_record_is_set_on(tr->array_buffer.buffer))
- ring_buffer_record_on(tr->snapshot_buffer.buffer);
- else
- ring_buffer_record_off(tr->snapshot_buffer.buffer);
-
- if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) {
- arch_spin_unlock(&tr->max_lock);
- return;
- }
-
- swap(tr->array_buffer.buffer, tr->snapshot_buffer.buffer);
-
- __update_max_tr(tr, tsk, cpu);
-
- arch_spin_unlock(&tr->max_lock);
-
- /* Any waiters on the old snapshot buffer need to wake up */
- ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS);
-}
-
-/**
- * update_max_tr_single - only copy one trace over, and reset the rest
- * @tr: tracer
- * @tsk: task with the latency
- * @cpu: the cpu of the buffer to copy.
- *
- * Flip the trace of a single CPU buffer between the @tr and the max_tr.
- */
-void
-update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
-{
- int ret;
-
- if (tr->stop_count)
- return;
-
- WARN_ON_ONCE(!irqs_disabled());
- if (!tr->allocated_snapshot) {
- /* Only the nop tracer should hit this when disabling */
- WARN_ON_ONCE(tr->current_trace != &nop_trace);
- return;
- }
-
- arch_spin_lock(&tr->max_lock);
-
- ret = ring_buffer_swap_cpu(tr->snapshot_buffer.buffer, tr->array_buffer.buffer, cpu);
-
- if (ret == -EBUSY) {
- /*
- * We failed to swap the buffer due to a commit taking
- * place on this CPU. We fail to record, but we reset
- * the max trace buffer (no one writes directly to it)
- * and flag that it failed.
- * Another reason is resize is in progress.
- */
- trace_array_printk_buf(tr->snapshot_buffer.buffer, _THIS_IP_,
- "Failed to swap buffers due to commit or resize in progress\n");
- }
-
- WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
-
- __update_max_tr(tr, tsk, cpu);
- arch_spin_unlock(&tr->max_lock);
-}
-#endif /* CONFIG_TRACER_SNAPSHOT */
-
struct pipe_wait {
struct trace_iterator *iter;
int wait_index;
@@ -1995,7 +1527,7 @@ int __init register_tracer(struct tracer *type)
return 0;
}
-static void tracing_reset_cpu(struct array_buffer *buf, int cpu)
+void tracing_reset_cpu(struct array_buffer *buf, int cpu)
{
struct trace_buffer *buffer = buf->buffer;
@@ -3760,50 +3292,6 @@ static void test_ftrace_alive(struct seq_file *m)
"# MAY BE MISSING FUNCTION EVENTS\n");
}
-#ifdef CONFIG_TRACER_SNAPSHOT
-static void show_snapshot_main_help(struct seq_file *m)
-{
- seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"
- "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
- "# Takes a snapshot of the main buffer.\n"
- "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"
- "# (Doesn't have to be '2' works with any number that\n"
- "# is not a '0' or '1')\n");
-}
-
-static void show_snapshot_percpu_help(struct seq_file *m)
-{
- seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
-#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
- seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
- "# Takes a snapshot of the main buffer for this cpu.\n");
-#else
- seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n"
- "# Must use main snapshot file to allocate.\n");
-#endif
- seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"
- "# (Doesn't have to be '2' works with any number that\n"
- "# is not a '0' or '1')\n");
-}
-
-static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
-{
- if (iter->tr->allocated_snapshot)
- seq_puts(m, "#\n# * Snapshot is allocated *\n#\n");
- else
- seq_puts(m, "#\n# * Snapshot is freed *\n#\n");
-
- seq_puts(m, "# Snapshot commands:\n");
- if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
- show_snapshot_main_help(m);
- else
- show_snapshot_percpu_help(m);
-}
-#else
-/* Should never be called */
-static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { }
-#endif
-
static int s_show(struct seq_file *m, void *v)
{
struct trace_iterator *iter = v;
@@ -3852,17 +3340,6 @@ static int s_show(struct seq_file *m, void *v)
return 0;
}
-/*
- * Should be used after trace_array_get(), trace_types_lock
- * ensures that i_cdev was already initialized.
- */
-static inline int tracing_get_cpu(struct inode *inode)
-{
- if (inode->i_cdev) /* See trace_create_cpu_file() */
- return (long)inode->i_cdev - 1;
- return RING_BUFFER_ALL_CPUS;
-}
-
static const struct seq_operations tracer_seq_ops = {
.start = s_start,
.next = s_next,
@@ -3889,7 +3366,7 @@ static void free_trace_iter_content(struct trace_iterator *iter)
free_cpumask_var(iter->started);
}
-static struct trace_iterator *
+struct trace_iterator *
__tracing_open(struct inode *inode, struct file *file, bool snapshot)
{
struct trace_array *tr = inode->i_private;
@@ -4022,6 +3499,11 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp)
if (ret)
return ret;
+ if ((filp->f_mode & FMODE_WRITE) && trace_array_is_readonly(tr)) {
+ trace_array_put(tr);
+ return -EACCES;
+ }
+
filp->private_data = inode->i_private;
return 0;
@@ -4050,8 +3532,6 @@ int tracing_open_file_tr(struct inode *inode, struct file *filp)
event_file_get(file);
}
- filp->private_data = inode->i_private;
-
return 0;
}
@@ -4071,7 +3551,7 @@ int tracing_single_release_file_tr(struct inode *inode, struct file *filp)
return single_release(inode, filp);
}
-static int tracing_release(struct inode *inode, struct file *file)
+int tracing_release(struct inode *inode, struct file *file)
{
struct trace_array *tr = inode->i_private;
struct seq_file *m = file->private_data;
@@ -5222,7 +4702,7 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
return t->init(tr);
}
-static void set_buffer_entries(struct array_buffer *buf, unsigned long val)
+void trace_set_buffer_entries(struct array_buffer *buf, unsigned long val)
{
int cpu;
@@ -5233,40 +4713,12 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val)
static void update_buffer_entries(struct array_buffer *buf, int cpu)
{
if (cpu == RING_BUFFER_ALL_CPUS) {
- set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0));
+ trace_set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0));
} else {
per_cpu_ptr(buf->data, cpu)->entries = ring_buffer_size(buf->buffer, cpu);
}
}
-#ifdef CONFIG_TRACER_SNAPSHOT
-/* resize @tr's buffer to the size of @size_tr's entries */
-static int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
- struct array_buffer *size_buf, int cpu_id)
-{
- int cpu, ret = 0;
-
- if (cpu_id == RING_BUFFER_ALL_CPUS) {
- for_each_tracing_cpu(cpu) {
- ret = ring_buffer_resize(trace_buf->buffer,
- per_cpu_ptr(size_buf->data, cpu)->entries, cpu);
- if (ret < 0)
- break;
- per_cpu_ptr(trace_buf->data, cpu)->entries =
- per_cpu_ptr(size_buf->data, cpu)->entries;
- }
- } else {
- ret = ring_buffer_resize(trace_buf->buffer,
- per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id);
- if (ret == 0)
- per_cpu_ptr(trace_buf->data, cpu_id)->entries =
- per_cpu_ptr(size_buf->data, cpu_id)->entries;
- }
-
- return ret;
-}
-#endif /* CONFIG_TRACER_SNAPSHOT */
-
static int __tracing_resize_ring_buffer(struct trace_array *tr,
unsigned long size, int cpu)
{
@@ -5462,6 +4914,10 @@ static void update_last_data(struct trace_array *tr)
/* Only if the buffer has previous boot data clear and update it. */
tr->flags &= ~TRACE_ARRAY_FL_LAST_BOOT;
+ /* If this is a backup instance, mark it for autoremove. */
+ if (tr->flags & TRACE_ARRAY_FL_VMALLOC)
+ tr->free_on_close = true;
+
/* Reset the module list and reload them */
if (tr->scratch) {
struct trace_scratch *tscratch = tr->scratch;
@@ -5685,9 +5141,8 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
return ret;
}
-static ssize_t
-tracing_nsecs_read(unsigned long *ptr, char __user *ubuf,
- size_t cnt, loff_t *ppos)
+ssize_t tracing_nsecs_read(unsigned long *ptr, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
char buf[64];
int r;
@@ -5699,9 +5154,8 @@ tracing_nsecs_read(unsigned long *ptr, char __user *ubuf,
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
-static ssize_t
-tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+ssize_t tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
unsigned long val;
int ret;
@@ -5743,28 +5197,6 @@ tracing_thresh_write(struct file *filp, const char __user *ubuf,
return cnt;
}
-#ifdef CONFIG_TRACER_MAX_TRACE
-
-static ssize_t
-tracing_max_lat_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- struct trace_array *tr = filp->private_data;
-
- return tracing_nsecs_read(&tr->max_latency, ubuf, cnt, ppos);
-}
-
-static ssize_t
-tracing_max_lat_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- struct trace_array *tr = filp->private_data;
-
- return tracing_nsecs_write(&tr->max_latency, ubuf, cnt, ppos);
-}
-
-#endif
-
static int open_pipe_on_cpu(struct trace_array *tr, int cpu)
{
if (cpu == RING_BUFFER_ALL_CPUS) {
@@ -6784,6 +6216,23 @@ char *trace_user_fault_read(struct trace_user_buf_info *tinfo,
do {
/*
+ * It is possible that something is trying to migrate this
+ * task. What happens then, is when preemption is enabled,
+ * the migration thread will preempt this task, try to
+ * migrate it, fail, then let it run again. That will
+ * cause this to loop again and never succeed.
+ * On failures, enabled and disable preemption with
+ * migration enabled, to allow the migration thread to
+ * migrate this task.
+ */
+ if (trys) {
+ preempt_enable_notrace();
+ preempt_disable_notrace();
+ cpu = smp_processor_id();
+ buffer = per_cpu_ptr(tinfo->tbuf, cpu)->buf;
+ }
+
+ /*
* If for some reason, copy_from_user() always causes a context
* switch, this would then cause an infinite loop.
* If this task is preempted by another user space task, it
@@ -7080,6 +6529,11 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
if (ret)
return ret;
+ if ((file->f_mode & FMODE_WRITE) && trace_array_is_readonly(tr)) {
+ trace_array_put(tr);
+ return -EACCES;
+ }
+
ret = single_open(file, tracing_clock_show, inode->i_private);
if (ret < 0)
trace_array_put(tr);
@@ -7125,194 +6579,6 @@ u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_eve
return ring_buffer_event_time_stamp(buffer, rbe);
}
-struct ftrace_buffer_info {
- struct trace_iterator iter;
- void *spare;
- unsigned int spare_cpu;
- unsigned int spare_size;
- unsigned int read;
-};
-
-#ifdef CONFIG_TRACER_SNAPSHOT
-static int tracing_snapshot_open(struct inode *inode, struct file *file)
-{
- struct trace_array *tr = inode->i_private;
- struct trace_iterator *iter;
- struct seq_file *m;
- int ret;
-
- ret = tracing_check_open_get_tr(tr);
- if (ret)
- return ret;
-
- if (file->f_mode & FMODE_READ) {
- iter = __tracing_open(inode, file, true);
- if (IS_ERR(iter))
- ret = PTR_ERR(iter);
- } else {
- /* Writes still need the seq_file to hold the private data */
- ret = -ENOMEM;
- m = kzalloc_obj(*m);
- if (!m)
- goto out;
- iter = kzalloc_obj(*iter);
- if (!iter) {
- kfree(m);
- goto out;
- }
- ret = 0;
-
- iter->tr = tr;
- iter->array_buffer = &tr->snapshot_buffer;
- iter->cpu_file = tracing_get_cpu(inode);
- m->private = iter;
- file->private_data = m;
- }
-out:
- if (ret < 0)
- trace_array_put(tr);
-
- return ret;
-}
-
-static void tracing_swap_cpu_buffer(void *tr)
-{
- update_max_tr_single((struct trace_array *)tr, current, smp_processor_id());
-}
-
-static ssize_t
-tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
- loff_t *ppos)
-{
- struct seq_file *m = filp->private_data;
- struct trace_iterator *iter = m->private;
- struct trace_array *tr = iter->tr;
- unsigned long val;
- int ret;
-
- ret = tracing_update_buffers(tr);
- if (ret < 0)
- return ret;
-
- ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
- if (ret)
- return ret;
-
- guard(mutex)(&trace_types_lock);
-
- if (tracer_uses_snapshot(tr->current_trace))
- return -EBUSY;
-
- local_irq_disable();
- arch_spin_lock(&tr->max_lock);
- if (tr->cond_snapshot)
- ret = -EBUSY;
- arch_spin_unlock(&tr->max_lock);
- local_irq_enable();
- if (ret)
- return ret;
-
- switch (val) {
- case 0:
- if (iter->cpu_file != RING_BUFFER_ALL_CPUS)
- return -EINVAL;
- if (tr->allocated_snapshot)
- free_snapshot(tr);
- break;
- case 1:
-/* Only allow per-cpu swap if the ring buffer supports it */
-#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP
- if (iter->cpu_file != RING_BUFFER_ALL_CPUS)
- return -EINVAL;
-#endif
- if (tr->allocated_snapshot)
- ret = resize_buffer_duplicate_size(&tr->snapshot_buffer,
- &tr->array_buffer, iter->cpu_file);
-
- ret = tracing_arm_snapshot_locked(tr);
- if (ret)
- return ret;
-
- /* Now, we're going to swap */
- if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
- local_irq_disable();
- update_max_tr(tr, current, smp_processor_id(), NULL);
- local_irq_enable();
- } else {
- smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer,
- (void *)tr, 1);
- }
- tracing_disarm_snapshot(tr);
- break;
- default:
- if (tr->allocated_snapshot) {
- if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
- tracing_reset_online_cpus(&tr->snapshot_buffer);
- else
- tracing_reset_cpu(&tr->snapshot_buffer, iter->cpu_file);
- }
- break;
- }
-
- if (ret >= 0) {
- *ppos += cnt;
- ret = cnt;
- }
-
- return ret;
-}
-
-static int tracing_snapshot_release(struct inode *inode, struct file *file)
-{
- struct seq_file *m = file->private_data;
- int ret;
-
- ret = tracing_release(inode, file);
-
- if (file->f_mode & FMODE_READ)
- return ret;
-
- /* If write only, the seq_file is just a stub */
- if (m)
- kfree(m->private);
- kfree(m);
-
- return 0;
-}
-
-static int tracing_buffers_open(struct inode *inode, struct file *filp);
-static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf,
- size_t count, loff_t *ppos);
-static int tracing_buffers_release(struct inode *inode, struct file *file);
-static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos,
- struct pipe_inode_info *pipe, size_t len, unsigned int flags);
-
-static int snapshot_raw_open(struct inode *inode, struct file *filp)
-{
- struct ftrace_buffer_info *info;
- int ret;
-
- /* The following checks for tracefs lockdown */
- ret = tracing_buffers_open(inode, filp);
- if (ret < 0)
- return ret;
-
- info = filp->private_data;
-
- if (tracer_uses_snapshot(info->iter.trace)) {
- tracing_buffers_release(inode, filp);
- return -EBUSY;
- }
-
- info->iter.snapshot = true;
- info->iter.array_buffer = &info->iter.tr->snapshot_buffer;
-
- return ret;
-}
-
-#endif /* CONFIG_TRACER_SNAPSHOT */
-
-
static const struct file_operations tracing_thresh_fops = {
.open = tracing_open_generic,
.read = tracing_thresh_read,
@@ -7320,16 +6586,6 @@ static const struct file_operations tracing_thresh_fops = {
.llseek = generic_file_llseek,
};
-#ifdef CONFIG_TRACER_MAX_TRACE
-static const struct file_operations tracing_max_lat_fops = {
- .open = tracing_open_generic_tr,
- .read = tracing_max_lat_read,
- .write = tracing_max_lat_write,
- .llseek = generic_file_llseek,
- .release = tracing_release_generic_tr,
-};
-#endif
-
static const struct file_operations set_tracer_fops = {
.open = tracing_open_generic_tr,
.read = tracing_set_trace_read,
@@ -7416,24 +6672,6 @@ static const struct file_operations last_boot_fops = {
.release = tracing_seq_release,
};
-#ifdef CONFIG_TRACER_SNAPSHOT
-static const struct file_operations snapshot_fops = {
- .open = tracing_snapshot_open,
- .read = seq_read,
- .write = tracing_snapshot_write,
- .llseek = tracing_lseek,
- .release = tracing_snapshot_release,
-};
-
-static const struct file_operations snapshot_raw_fops = {
- .open = snapshot_raw_open,
- .read = tracing_buffers_read,
- .release = tracing_buffers_release,
- .splice_read = tracing_buffers_splice_read,
-};
-
-#endif /* CONFIG_TRACER_SNAPSHOT */
-
/*
* trace_min_max_write - Write a u64 value to a trace_min_max_param struct
* @filp: The active open file structure
@@ -7793,7 +7031,7 @@ static const struct file_operations tracing_err_log_fops = {
.release = tracing_err_log_release,
};
-static int tracing_buffers_open(struct inode *inode, struct file *filp)
+int tracing_buffers_open(struct inode *inode, struct file *filp)
{
struct trace_array *tr = inode->i_private;
struct ftrace_buffer_info *info;
@@ -7841,9 +7079,8 @@ tracing_buffers_poll(struct file *filp, poll_table *poll_table)
return trace_poll(iter, filp, poll_table);
}
-static ssize_t
-tracing_buffers_read(struct file *filp, char __user *ubuf,
- size_t count, loff_t *ppos)
+ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos)
{
struct ftrace_buffer_info *info = filp->private_data;
struct trace_iterator *iter = &info->iter;
@@ -7944,7 +7181,7 @@ static int tracing_buffers_flush(struct file *file, fl_owner_t id)
return 0;
}
-static int tracing_buffers_release(struct inode *inode, struct file *file)
+int tracing_buffers_release(struct inode *inode, struct file *file)
{
struct ftrace_buffer_info *info = file->private_data;
struct trace_iterator *iter = &info->iter;
@@ -8018,10 +7255,9 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
spd->partial[i].private = 0;
}
-static ssize_t
-tracing_buffers_splice_read(struct file *file, loff_t *ppos,
- struct pipe_inode_info *pipe, size_t len,
- unsigned int flags)
+ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
{
struct ftrace_buffer_info *info = file->private_data;
struct trace_iterator *iter = &info->iter;
@@ -8175,43 +7411,17 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned
return 0;
}
-#ifdef CONFIG_TRACER_SNAPSHOT
-static int get_snapshot_map(struct trace_array *tr)
+/*
+ * This is called when a VMA is duplicated (e.g., on fork()) to increment
+ * the user_mapped counter without remapping pages.
+ */
+static void tracing_buffers_mmap_open(struct vm_area_struct *vma)
{
- int err = 0;
-
- /*
- * Called with mmap_lock held. lockdep would be unhappy if we would now
- * take trace_types_lock. Instead use the specific
- * snapshot_trigger_lock.
- */
- spin_lock(&tr->snapshot_trigger_lock);
-
- if (tr->snapshot || tr->mapped == UINT_MAX)
- err = -EBUSY;
- else
- tr->mapped++;
-
- spin_unlock(&tr->snapshot_trigger_lock);
-
- /* Wait for update_max_tr() to observe iter->tr->mapped */
- if (tr->mapped == 1)
- synchronize_rcu();
-
- return err;
+ struct ftrace_buffer_info *info = vma->vm_file->private_data;
+ struct trace_iterator *iter = &info->iter;
+ ring_buffer_map_dup(iter->array_buffer->buffer, iter->cpu_file);
}
-static void put_snapshot_map(struct trace_array *tr)
-{
- spin_lock(&tr->snapshot_trigger_lock);
- if (!WARN_ON(!tr->mapped))
- tr->mapped--;
- spin_unlock(&tr->snapshot_trigger_lock);
-}
-#else
-static inline int get_snapshot_map(struct trace_array *tr) { return 0; }
-static inline void put_snapshot_map(struct trace_array *tr) { }
-#endif
static void tracing_buffers_mmap_close(struct vm_area_struct *vma)
{
@@ -8232,6 +7442,7 @@ static int tracing_buffers_may_split(struct vm_area_struct *vma, unsigned long a
}
static const struct vm_operations_struct tracing_buffers_vmops = {
+ .open = tracing_buffers_mmap_open,
.close = tracing_buffers_mmap_close,
.may_split = tracing_buffers_may_split,
};
@@ -8380,170 +7591,6 @@ static const struct file_operations tracing_dyn_info_fops = {
};
#endif /* CONFIG_DYNAMIC_FTRACE */
-#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE)
-static void
-ftrace_snapshot(unsigned long ip, unsigned long parent_ip,
- struct trace_array *tr, struct ftrace_probe_ops *ops,
- void *data)
-{
- tracing_snapshot_instance(tr);
-}
-
-static void
-ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip,
- struct trace_array *tr, struct ftrace_probe_ops *ops,
- void *data)
-{
- struct ftrace_func_mapper *mapper = data;
- long *count = NULL;
-
- if (mapper)
- count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
-
- if (count) {
-
- if (*count <= 0)
- return;
-
- (*count)--;
- }
-
- tracing_snapshot_instance(tr);
-}
-
-static int
-ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
- struct ftrace_probe_ops *ops, void *data)
-{
- struct ftrace_func_mapper *mapper = data;
- long *count = NULL;
-
- seq_printf(m, "%ps:", (void *)ip);
-
- seq_puts(m, "snapshot");
-
- if (mapper)
- count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
-
- if (count)
- seq_printf(m, ":count=%ld\n", *count);
- else
- seq_puts(m, ":unlimited\n");
-
- return 0;
-}
-
-static int
-ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
- unsigned long ip, void *init_data, void **data)
-{
- struct ftrace_func_mapper *mapper = *data;
-
- if (!mapper) {
- mapper = allocate_ftrace_func_mapper();
- if (!mapper)
- return -ENOMEM;
- *data = mapper;
- }
-
- return ftrace_func_mapper_add_ip(mapper, ip, init_data);
-}
-
-static void
-ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
- unsigned long ip, void *data)
-{
- struct ftrace_func_mapper *mapper = data;
-
- if (!ip) {
- if (!mapper)
- return;
- free_ftrace_func_mapper(mapper, NULL);
- return;
- }
-
- ftrace_func_mapper_remove_ip(mapper, ip);
-}
-
-static struct ftrace_probe_ops snapshot_probe_ops = {
- .func = ftrace_snapshot,
- .print = ftrace_snapshot_print,
-};
-
-static struct ftrace_probe_ops snapshot_count_probe_ops = {
- .func = ftrace_count_snapshot,
- .print = ftrace_snapshot_print,
- .init = ftrace_snapshot_init,
- .free = ftrace_snapshot_free,
-};
-
-static int
-ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
- char *glob, char *cmd, char *param, int enable)
-{
- struct ftrace_probe_ops *ops;
- void *count = (void *)-1;
- char *number;
- int ret;
-
- if (!tr)
- return -ENODEV;
-
- /* hash funcs only work with set_ftrace_filter */
- if (!enable)
- return -EINVAL;
-
- ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops;
-
- if (glob[0] == '!') {
- ret = unregister_ftrace_function_probe_func(glob+1, tr, ops);
- if (!ret)
- tracing_disarm_snapshot(tr);
-
- return ret;
- }
-
- if (!param)
- goto out_reg;
-
- number = strsep(&param, ":");
-
- if (!strlen(number))
- goto out_reg;
-
- /*
- * We use the callback data field (which is a pointer)
- * as our counter.
- */
- ret = kstrtoul(number, 0, (unsigned long *)&count);
- if (ret)
- return ret;
-
- out_reg:
- ret = tracing_arm_snapshot(tr);
- if (ret < 0)
- return ret;
-
- ret = register_ftrace_function_probe(glob, tr, ops, count);
- if (ret < 0)
- tracing_disarm_snapshot(tr);
-
- return ret < 0 ? ret : 0;
-}
-
-static struct ftrace_func_command ftrace_snapshot_cmd = {
- .name = "snapshot",
- .func = ftrace_trace_snapshot_callback,
-};
-
-static __init int register_snapshot_cmd(void)
-{
- return register_ftrace_command(&ftrace_snapshot_cmd);
-}
-#else
-static inline __init int register_snapshot_cmd(void) { return 0; }
-#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */
-
static struct dentry *tracing_get_dentry(struct trace_array *tr)
{
/* Top directory uses NULL as the parent */
@@ -8576,7 +7623,7 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
return tr->percpu_dir;
}
-static struct dentry *
+struct dentry *
trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent,
void *data, long cpu, const struct file_operations *fops)
{
@@ -9336,8 +8383,7 @@ static void setup_trace_scratch(struct trace_array *tr,
memset(tscratch, 0, size);
}
-static int
-allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size)
+int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size)
{
enum ring_buffer_flags rb_flags;
struct trace_scratch *tscratch;
@@ -9376,8 +8422,8 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size
}
/* Allocate the first page for all buffers */
- set_buffer_entries(&tr->array_buffer,
- ring_buffer_size(tr->array_buffer.buffer, 0));
+ trace_set_buffer_entries(&tr->array_buffer,
+ ring_buffer_size(tr->array_buffer.buffer, 0));
return 0;
}
@@ -9392,7 +8438,7 @@ static void free_trace_buffer(struct array_buffer *buf)
}
}
-static int allocate_trace_buffers(struct trace_array *tr, int size)
+static int allocate_trace_buffers(struct trace_array *tr, unsigned long size)
{
int ret;
@@ -9400,23 +8446,11 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
if (ret)
return ret;
-#ifdef CONFIG_TRACER_SNAPSHOT
- /* Fix mapped buffer trace arrays do not have snapshot buffers */
- if (tr->range_addr_start)
- return 0;
-
- ret = allocate_trace_buffer(tr, &tr->snapshot_buffer,
- allocate_snapshot ? size : 1);
- if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) {
+ ret = trace_allocate_snapshot(tr, size);
+ if (MEM_FAIL(ret, "Failed to allocate trace buffer\n"))
free_trace_buffer(&tr->array_buffer);
- return -ENOMEM;
- }
- tr->allocated_snapshot = allocate_snapshot;
-
- allocate_snapshot = false;
-#endif
- return 0;
+ return ret;
}
static void free_trace_buffers(struct trace_array *tr)
@@ -9497,8 +8531,8 @@ struct trace_array *trace_array_find_get(const char *instance)
guard(mutex)(&trace_types_lock);
tr = trace_array_find(instance);
- if (tr)
- tr->ref++;
+ if (tr && __trace_array_get(tr) < 0)
+ tr = NULL;
return tr;
}
@@ -9595,6 +8629,8 @@ trace_array_create_systems(const char *name, const char *systems,
if (ftrace_allocate_ftrace_ops(tr) < 0)
goto out_free_tr;
+ trace_array_init_autoremove(tr);
+
ftrace_init_trace_array(tr);
init_trace_flags_index(tr);
@@ -9705,7 +8741,9 @@ struct trace_array *trace_array_get_by_name(const char *name, const char *system
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
if (tr->name && strcmp(tr->name, name) == 0) {
- tr->ref++;
+ /* if this fails, @tr is going to be removed. */
+ if (__trace_array_get(tr) < 0)
+ tr = NULL;
return tr;
}
}
@@ -9731,18 +8769,20 @@ static int __remove_instance(struct trace_array *tr)
list_del(&tr->list);
- /* Disable all the flags that were enabled coming in */
- for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) {
- if ((1ULL << i) & ZEROED_TRACE_FLAGS)
- set_tracer_flag(tr, 1ULL << i, 0);
- }
-
if (printk_trace == tr)
update_printk_trace(&global_trace);
+ /* Must be done before disabling all the flags */
if (update_marker_trace(tr, 0))
synchronize_rcu();
+ /* Disable all the flags that were enabled coming in */
+ for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) {
+ if ((1ULL << i) & ZEROED_TRACE_FLAGS)
+ set_tracer_flag(tr, 1ULL << i, 0);
+ }
+
+ trace_array_cancel_autoremove(tr);
tracing_set_nop(tr);
clear_ftrace_function_probes(tr);
event_trace_del_tracer(tr);
@@ -9835,17 +8875,22 @@ static __init void create_trace_instances(struct dentry *d_tracer)
static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
+ umode_t writable_mode = TRACE_MODE_WRITE;
int cpu;
+ if (trace_array_is_readonly(tr))
+ writable_mode = TRACE_MODE_READ;
+
trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer,
- tr, &show_traces_fops);
+ tr, &show_traces_fops);
- trace_create_file("current_tracer", TRACE_MODE_WRITE, d_tracer,
- tr, &set_tracer_fops);
+ trace_create_file("current_tracer", writable_mode, d_tracer,
+ tr, &set_tracer_fops);
- trace_create_file("tracing_cpumask", TRACE_MODE_WRITE, d_tracer,
+ trace_create_file("tracing_cpumask", writable_mode, d_tracer,
tr, &tracing_cpumask_fops);
+ /* Options are used for changing print-format even for readonly instance. */
trace_create_file("trace_options", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_iter_fops);
@@ -9855,12 +8900,36 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("trace_pipe", TRACE_MODE_READ, d_tracer,
tr, &tracing_pipe_fops);
- trace_create_file("buffer_size_kb", TRACE_MODE_WRITE, d_tracer,
+ trace_create_file("buffer_size_kb", writable_mode, d_tracer,
tr, &tracing_entries_fops);
trace_create_file("buffer_total_size_kb", TRACE_MODE_READ, d_tracer,
tr, &tracing_total_entries_fops);
+ trace_create_file("trace_clock", writable_mode, d_tracer, tr,
+ &trace_clock_fops);
+
+ trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr,
+ &trace_time_stamp_mode_fops);
+
+ tr->buffer_percent = 50;
+
+ trace_create_file("buffer_subbuf_size_kb", writable_mode, d_tracer,
+ tr, &buffer_subbuf_size_fops);
+
+ create_trace_options_dir(tr);
+
+ if (tr->range_addr_start)
+ trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer,
+ tr, &last_boot_fops);
+
+ for_each_tracing_cpu(cpu)
+ tracing_init_tracefs_percpu(tr, cpu);
+
+ /* A read-only instance only has the files above. */
+ if (trace_array_is_readonly(tr))
+ return;
+
trace_create_file("free_buffer", 0200, d_tracer,
tr, &tracing_free_buffer_fops);
@@ -9872,49 +8941,29 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("trace_marker_raw", 0220, d_tracer,
tr, &tracing_mark_raw_fops);
- trace_create_file("trace_clock", TRACE_MODE_WRITE, d_tracer, tr,
- &trace_clock_fops);
-
- trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer,
- tr, &rb_simple_fops);
-
- trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr,
- &trace_time_stamp_mode_fops);
-
- tr->buffer_percent = 50;
-
trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer,
- tr, &buffer_percent_fops);
-
- trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer,
- tr, &buffer_subbuf_size_fops);
+ tr, &buffer_percent_fops);
trace_create_file("syscall_user_buf_size", TRACE_MODE_WRITE, d_tracer,
- tr, &tracing_syscall_buf_fops);
+ tr, &tracing_syscall_buf_fops);
- create_trace_options_dir(tr);
+ trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer,
+ tr, &rb_simple_fops);
trace_create_maxlat_file(tr, d_tracer);
if (ftrace_create_function_files(tr, d_tracer))
MEM_FAIL(1, "Could not allocate function filter files");
- if (tr->range_addr_start) {
- trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer,
- tr, &last_boot_fops);
#ifdef CONFIG_TRACER_SNAPSHOT
- } else {
+ if (!tr->range_addr_start)
trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
tr, &snapshot_fops);
#endif
- }
trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_err_log_fops);
- for_each_tracing_cpu(cpu)
- tracing_init_tracefs_percpu(tr, cpu);
-
ftrace_init_tracefs(tr, d_tracer);
}
@@ -10523,47 +9572,6 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
return done;
}
-#ifdef CONFIG_TRACER_SNAPSHOT
-__init static bool tr_needs_alloc_snapshot(const char *name)
-{
- char *test;
- int len = strlen(name);
- bool ret;
-
- if (!boot_snapshot_index)
- return false;
-
- if (strncmp(name, boot_snapshot_info, len) == 0 &&
- boot_snapshot_info[len] == '\t')
- return true;
-
- test = kmalloc(strlen(name) + 3, GFP_KERNEL);
- if (!test)
- return false;
-
- sprintf(test, "\t%s\t", name);
- ret = strstr(boot_snapshot_info, test) == NULL;
- kfree(test);
- return ret;
-}
-
-__init static void do_allocate_snapshot(const char *name)
-{
- if (!tr_needs_alloc_snapshot(name))
- return;
-
- /*
- * When allocate_snapshot is set, the next call to
- * allocate_trace_buffers() (called by trace_array_get_by_name())
- * will allocate the snapshot buffer. That will also clear
- * this flag.
- */
- allocate_snapshot = true;
-}
-#else
-static inline void do_allocate_snapshot(const char *name) { }
-#endif
-
__init static int backup_instance_area(const char *backup,
unsigned long *addr, phys_addr_t *size)
{
@@ -10713,8 +9721,7 @@ __init static void enable_instances(void)
}
} else {
/* Only non mapped buffers have snapshot buffers */
- if (IS_ENABLED(CONFIG_TRACER_SNAPSHOT))
- do_allocate_snapshot(name);
+ do_allocate_snapshot(name);
}
tr = trace_array_create_systems(name, NULL, addr, size);
@@ -10740,23 +9747,47 @@ __init static void enable_instances(void)
/*
* Backup buffers can be freed but need vfree().
*/
- if (backup)
- tr->flags |= TRACE_ARRAY_FL_VMALLOC;
+ if (backup) {
+ tr->flags |= TRACE_ARRAY_FL_VMALLOC | TRACE_ARRAY_FL_RDONLY;
+ trace_array_start_autoremove();
+ }
if (start || backup) {
tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT;
tr->range_name = no_free_ptr(rname);
}
+ /*
+ * Save the events to start and enable them after all boot instances
+ * have been created.
+ */
+ tr->boot_events = curr_str;
+ }
+
+ /* Enable the events after all boot instances have been created */
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+
+ if (!tr->boot_events || !(*tr->boot_events)) {
+ tr->boot_events = NULL;
+ continue;
+ }
+
+ curr_str = tr->boot_events;
+
+ /* Clear the instance if this is a persistent buffer */
+ if (tr->flags & TRACE_ARRAY_FL_LAST_BOOT)
+ update_last_data(tr);
+
while ((tok = strsep(&curr_str, ","))) {
early_enable_events(tr, tok, true);
}
+ tr->boot_events = NULL;
}
}
__init static int tracer_alloc_buffers(void)
{
- int ring_buf_size;
+ unsigned long ring_buf_size;
int ret = -ENOMEM;
@@ -10906,24 +9937,6 @@ struct trace_array *trace_get_global_array(void)
}
#endif
-void __init ftrace_boot_snapshot(void)
-{
-#ifdef CONFIG_TRACER_SNAPSHOT
- struct trace_array *tr;
-
- if (!snapshot_at_boot)
- return;
-
- list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- if (!tr->allocated_snapshot)
- continue;
-
- tracing_snapshot_instance(tr);
- trace_array_puts(tr, "** Boot snapshot taken **\n");
- }
-#endif
-}
-
void __init early_trace_init(void)
{
if (tracepoint_printk) {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b8f3804586a0..80fe152af1dd 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -264,6 +264,7 @@ static inline bool still_need_pid_events(int type, struct trace_pid_list *pid_li
typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data);
+#ifdef CONFIG_TRACER_SNAPSHOT
/**
* struct cond_snapshot - conditional snapshot data and callback
*
@@ -306,6 +307,7 @@ struct cond_snapshot {
void *cond_data;
cond_update_fn_t update;
};
+#endif /* CONFIG_TRACER_SNAPSHOT */
/*
* struct trace_func_repeats - used to keep track of the consecutive
@@ -405,7 +407,10 @@ struct trace_array {
unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE];
unsigned int flags;
raw_spinlock_t start_lock;
- const char *system_names;
+ union {
+ const char *system_names;
+ char *boot_events;
+ };
struct list_head err_log;
struct dentry *dir;
struct dentry *options;
@@ -453,6 +458,12 @@ struct trace_array {
* we do not waste memory on systems that are not using tracing.
*/
bool ring_buffer_expanded;
+ /*
+ * If the ring buffer is a read only backup instance, it will be
+ * removed after dumping all data via the pipe, as it has no readable data left.
+ */
+ bool free_on_close;
+ struct work_struct autoremove_work;
};
enum {
@@ -462,6 +473,7 @@ enum {
TRACE_ARRAY_FL_MOD_INIT = BIT(3),
TRACE_ARRAY_FL_MEMMAP = BIT(4),
TRACE_ARRAY_FL_VMALLOC = BIT(5),
+ TRACE_ARRAY_FL_RDONLY = BIT(6),
};
#ifdef CONFIG_MODULES
@@ -491,6 +503,12 @@ extern unsigned long trace_adjust_address(struct trace_array *tr, unsigned long
extern struct trace_array *printk_trace;
+static inline bool trace_array_is_readonly(struct trace_array *tr)
+{
+ /* backup instance is read only. */
+ return tr->flags & TRACE_ARRAY_FL_RDONLY;
+}
+
/*
* The global tracer (top) should be the first trace array added,
* but we check the flag anyway.
@@ -675,6 +693,7 @@ void tracing_reset_all_online_cpus(void);
void tracing_reset_all_online_cpus_unlocked(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
int tracing_open_generic_tr(struct inode *inode, struct file *filp);
+int tracing_release(struct inode *inode, struct file *file);
int tracing_release_generic_tr(struct inode *inode, struct file *file);
int tracing_open_file_tr(struct inode *inode, struct file *filp);
int tracing_release_file_tr(struct inode *inode, struct file *filp);
@@ -684,12 +703,54 @@ void tracer_tracing_on(struct trace_array *tr);
void tracer_tracing_off(struct trace_array *tr);
void tracer_tracing_disable(struct trace_array *tr);
void tracer_tracing_enable(struct trace_array *tr);
+int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size);
struct dentry *trace_create_file(const char *name,
umode_t mode,
struct dentry *parent,
void *data,
const struct file_operations *fops);
+struct dentry *trace_create_cpu_file(const char *name,
+ umode_t mode,
+ struct dentry *parent,
+ void *data,
+ long cpu,
+ const struct file_operations *fops);
+
+struct trace_iterator *__tracing_open(struct inode *inode, struct file *file,
+ bool snapshot);
+int tracing_buffers_open(struct inode *inode, struct file *filp);
+ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos);
+int tracing_buffers_release(struct inode *inode, struct file *file);
+ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len, unsigned int flags);
+
+ssize_t tracing_nsecs_read(unsigned long *ptr, char __user *ubuf,
+ size_t cnt, loff_t *ppos);
+ssize_t tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf,
+ size_t cnt, loff_t *ppos);
+
+void trace_set_buffer_entries(struct array_buffer *buf, unsigned long val);
+/*
+ * Should be used after trace_array_get(), trace_types_lock
+ * ensures that i_cdev was already initialized.
+ */
+static inline int tracing_get_cpu(struct inode *inode)
+{
+ if (inode->i_cdev) /* See trace_create_cpu_file() */
+ return (long)inode->i_cdev - 1;
+ return RING_BUFFER_ALL_CPUS;
+}
+void tracing_reset_cpu(struct array_buffer *buf, int cpu);
+
+struct ftrace_buffer_info {
+ struct trace_iterator iter;
+ void *spare;
+ unsigned int spare_cpu;
+ unsigned int spare_size;
+ unsigned int read;
+};
/**
* tracer_tracing_is_on_cpu - show real state of ring buffer enabled on for a cpu
@@ -806,13 +867,13 @@ void update_max_tr_single(struct trace_array *tr,
#if defined(CONFIG_TRACER_MAX_TRACE) && defined(CONFIG_FSNOTIFY)
# define LATENCY_FS_NOTIFY
#endif
+#endif /* CONFIG_TRACER_SNAPSHOT */
#ifdef LATENCY_FS_NOTIFY
void latency_fsnotify(struct trace_array *tr);
#else
static inline void latency_fsnotify(struct trace_array *tr) { }
#endif
-#endif /* CONFIG_TRACER_SNAPSHOT */
#ifdef CONFIG_STACKTRACE
void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip);
@@ -828,11 +889,15 @@ static inline bool tracer_uses_snapshot(struct tracer *tracer)
{
return tracer->use_max_tr;
}
+void trace_create_maxlat_file(struct trace_array *tr,
+ struct dentry *d_tracer);
#else
static inline bool tracer_uses_snapshot(struct tracer *tracer)
{
return false;
}
+static inline void trace_create_maxlat_file(struct trace_array *tr,
+ struct dentry *d_tracer) { }
#endif
void trace_last_func_repeats(struct trace_array *tr,
@@ -862,6 +927,8 @@ extern int DYN_FTRACE_TEST_NAME(void);
#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
extern int DYN_FTRACE_TEST_NAME2(void);
+void __init trace_append_boot_param(char *buf, const char *str,
+ char sep, int size);
extern void trace_set_ring_buffer_expanded(struct trace_array *tr);
extern bool tracing_selftest_disabled;
@@ -1802,11 +1869,6 @@ extern struct trace_event_file *find_event_file(struct trace_array *tr,
const char *system,
const char *event);
-static inline void *event_file_data(struct file *filp)
-{
- return READ_ONCE(file_inode(filp)->i_private);
-}
-
extern struct mutex event_mutex;
extern struct list_head ftrace_events;
@@ -1827,12 +1889,22 @@ static inline struct trace_event_file *event_file_file(struct file *filp)
struct trace_event_file *file;
lockdep_assert_held(&event_mutex);
- file = READ_ONCE(file_inode(filp)->i_private);
+ file = file_inode(filp)->i_private;
if (!file || file->flags & EVENT_FILE_FL_FREED)
return NULL;
return file;
}
+static inline void *event_file_data(struct file *filp)
+{
+ struct trace_event_file *file;
+
+ lockdep_assert_held(&event_mutex);
+ file = file_inode(filp)->i_private;
+ WARN_ON(!file || file->flags & EVENT_FILE_FL_FREED);
+ return file;
+}
+
extern const struct file_operations event_trigger_fops;
extern const struct file_operations event_hist_fops;
extern const struct file_operations event_hist_debug_fops;
@@ -2135,12 +2207,6 @@ static inline bool event_command_needs_rec(struct event_command *cmd_ops)
extern int trace_event_enable_disable(struct trace_event_file *file,
int enable, int soft_disable);
-extern int tracing_alloc_snapshot(void);
-extern void tracing_snapshot_cond(struct trace_array *tr, void *cond_data);
-extern int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update);
-
-extern int tracing_snapshot_cond_disable(struct trace_array *tr);
-extern void *tracing_cond_snapshot_data(struct trace_array *tr);
extern const char *__start___trace_bprintk_fmt[];
extern const char *__stop___trace_bprintk_fmt[];
@@ -2228,19 +2294,71 @@ static inline void trace_event_update_all(struct trace_eval_map **map, int len)
#endif
#ifdef CONFIG_TRACER_SNAPSHOT
+extern const struct file_operations snapshot_fops;
+extern const struct file_operations snapshot_raw_fops;
+
+/* Used when creating instances */
+int trace_allocate_snapshot(struct trace_array *tr, int size);
+
+int tracing_alloc_snapshot(void);
+void tracing_snapshot_cond(struct trace_array *tr, void *cond_data);
+int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update);
+int tracing_snapshot_cond_disable(struct trace_array *tr);
+void *tracing_cond_snapshot_data(struct trace_array *tr);
void tracing_snapshot_instance(struct trace_array *tr);
int tracing_alloc_snapshot_instance(struct trace_array *tr);
+int tracing_arm_snapshot_locked(struct trace_array *tr);
int tracing_arm_snapshot(struct trace_array *tr);
void tracing_disarm_snapshot(struct trace_array *tr);
-#else
+void free_snapshot(struct trace_array *tr);
+void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter);
+int get_snapshot_map(struct trace_array *tr);
+void put_snapshot_map(struct trace_array *tr);
+int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
+ struct array_buffer *size_buf, int cpu_id);
+__init void do_allocate_snapshot(const char *name);
+# ifdef CONFIG_DYNAMIC_FTRACE
+__init int register_snapshot_cmd(void);
+# else
+static inline int register_snapshot_cmd(void) { return 0; }
+# endif
+#else /* !CONFIG_TRACER_SNAPSHOT */
+static inline int trace_allocate_snapshot(struct trace_array *tr, int size) { return 0; }
static inline void tracing_snapshot_instance(struct trace_array *tr) { }
static inline int tracing_alloc_snapshot_instance(struct trace_array *tr)
{
return 0;
}
+static inline int tracing_arm_snapshot_locked(struct trace_array *tr) { return -EBUSY; }
static inline int tracing_arm_snapshot(struct trace_array *tr) { return 0; }
static inline void tracing_disarm_snapshot(struct trace_array *tr) { }
-#endif
+static inline void free_snapshot(struct trace_array *tr) {}
+static inline void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
+{
+ WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used");
+}
+static inline void *tracing_cond_snapshot_data(struct trace_array *tr)
+{
+ return NULL;
+}
+static inline int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update)
+{
+ return -ENODEV;
+}
+static inline int tracing_snapshot_cond_disable(struct trace_array *tr)
+{
+ return false;
+}
+static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
+{
+ /* Should never be called */
+ WARN_ONCE(1, "Snapshot print function called without snapshot configured");
+}
+static inline int get_snapshot_map(struct trace_array *tr) { return 0; }
+static inline void put_snapshot_map(struct trace_array *tr) { }
+static inline void do_allocate_snapshot(const char *name) { }
+static inline int register_snapshot_cmd(void) { return 0; }
+#endif /* CONFIG_TRACER_SNAPSHOT */
#ifdef CONFIG_PREEMPT_TRACER
void tracer_preempt_on(unsigned long a0, unsigned long a1);
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index dbe29b4c6a7a..2ca2541c8a58 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -61,7 +61,8 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node)
v = memparse(p, NULL);
if (v < PAGE_SIZE)
pr_err("Buffer size is too small: %s\n", p);
- if (tracing_resize_ring_buffer(tr, v, RING_BUFFER_ALL_CPUS) < 0)
+ if (trace_array_is_readonly(tr) ||
+ tracing_resize_ring_buffer(tr, v, RING_BUFFER_ALL_CPUS) < 0)
pr_err("Failed to resize trace buffer to %s\n", p);
}
@@ -597,7 +598,7 @@ trace_boot_enable_tracer(struct trace_array *tr, struct xbc_node *node)
p = xbc_node_find_value(node, "tracer", NULL);
if (p && *p != '\0') {
- if (tracing_set_tracer(tr, p) < 0)
+ if (trace_array_is_readonly(tr) || tracing_set_tracer(tr, p) < 0)
pr_err("Failed to set given tracer: %s\n", p);
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 9928da636c9d..c46e623e7e0d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1039,6 +1039,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
struct trace_pid_list *pid_list;
struct trace_array *tr = data;
+ guard(preempt)();
pid_list = rcu_dereference_raw(tr->filtered_pids);
trace_filter_add_remove_task(pid_list, NULL, task);
@@ -1054,6 +1055,7 @@ event_filter_pid_sched_process_fork(void *data,
struct trace_pid_list *pid_list;
struct trace_array *tr = data;
+ guard(preempt)();
pid_list = rcu_dereference_sched(tr->filtered_pids);
trace_filter_add_remove_task(pid_list, self, task);
@@ -1399,6 +1401,9 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
{
int ret;
+ if (trace_array_is_readonly(tr))
+ return -EACCES;
+
mutex_lock(&event_mutex);
ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set, mod);
mutex_unlock(&event_mutex);
@@ -1716,7 +1721,7 @@ static int t_show_filters(struct seq_file *m, void *v)
len = get_call_len(call);
- seq_printf(m, "%s:%s%*.s%s\n", call->class->system,
+ seq_printf(m, "%s:%s%*s%s\n", call->class->system,
trace_event_name(call), len, "", filter->filter_string);
return 0;
@@ -1748,7 +1753,7 @@ static int t_show_triggers(struct seq_file *m, void *v)
len = get_call_len(call);
list_for_each_entry_rcu(data, &file->triggers, list) {
- seq_printf(m, "%s:%s%*.s", call->class->system,
+ seq_printf(m, "%s:%s%*s", call->class->system,
trace_event_name(call), len, "");
data->cmd_ops->print(m, data);
@@ -2182,12 +2187,12 @@ static int trace_format_open(struct inode *inode, struct file *file)
static ssize_t
event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
- int id = (long)event_file_data(filp);
+ /* id is directly in i_private and available for the inode's lifetime. */
+ int id = (long)file_inode(filp)->i_private;
char buf[32];
int len;
- if (unlikely(!id))
- return -ENODEV;
+ WARN_ON(!id);
len = sprintf(buf, "%d\n", id);
@@ -2245,12 +2250,8 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
mutex_lock(&event_mutex);
file = event_file_file(filp);
- if (file) {
- if (file->flags & EVENT_FILE_FL_FREED)
- err = -ENODEV;
- else
- err = apply_event_filter(file, buf);
- }
+ if (file)
+ err = apply_event_filter(file, buf);
mutex_unlock(&event_mutex);
kfree(buf);
@@ -2971,8 +2972,8 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
} else
__get_system(system);
- /* ftrace only has directories no files */
- if (strcmp(name, "ftrace") == 0)
+ /* ftrace only has directories no files, readonly instance too. */
+ if (strcmp(name, "ftrace") == 0 || trace_array_is_readonly(tr))
nr_entries = 0;
else
nr_entries = ARRAY_SIZE(system_entries);
@@ -3137,28 +3138,30 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
int ret;
static struct eventfs_entry event_entries[] = {
{
- .name = "enable",
+ .name = "format",
.callback = event_callback,
- .release = event_release,
},
+#ifdef CONFIG_PERF_EVENTS
{
- .name = "filter",
+ .name = "id",
.callback = event_callback,
},
+#endif
+#define NR_RO_EVENT_ENTRIES (1 + IS_ENABLED(CONFIG_PERF_EVENTS))
+/* Readonly files must be above this line and counted by NR_RO_EVENT_ENTRIES. */
{
- .name = "trigger",
+ .name = "enable",
.callback = event_callback,
+ .release = event_release,
},
{
- .name = "format",
+ .name = "filter",
.callback = event_callback,
},
-#ifdef CONFIG_PERF_EVENTS
{
- .name = "id",
+ .name = "trigger",
.callback = event_callback,
},
-#endif
#ifdef CONFIG_HIST_TRIGGERS
{
.name = "hist",
@@ -3191,7 +3194,10 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
if (!e_events)
return -ENOMEM;
- nr_entries = ARRAY_SIZE(event_entries);
+ if (trace_array_is_readonly(tr))
+ nr_entries = NR_RO_EVENT_ENTRIES;
+ else
+ nr_entries = ARRAY_SIZE(event_entries);
name = trace_event_name(call);
ei = eventfs_create_dir(name, e_events, event_entries, nr_entries, file);
@@ -3677,20 +3683,27 @@ static struct boot_triggers {
} bootup_triggers[MAX_BOOT_TRIGGERS];
static char bootup_trigger_buf[COMMAND_LINE_SIZE];
+static int boot_trigger_buf_len;
static int nr_boot_triggers;
static __init int setup_trace_triggers(char *str)
{
char *trigger;
char *buf;
+ int len = boot_trigger_buf_len;
int i;
- strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE);
+ if (len >= COMMAND_LINE_SIZE)
+ return 1;
+
+ strscpy(bootup_trigger_buf + len, str, COMMAND_LINE_SIZE - len);
trace_set_ring_buffer_expanded(NULL);
disable_tracing_selftest("running event triggers");
- buf = bootup_trigger_buf;
- for (i = 0; i < MAX_BOOT_TRIGGERS; i++) {
+ buf = bootup_trigger_buf + len;
+ boot_trigger_buf_len += strlen(buf) + 1;
+
+ for (i = nr_boot_triggers; i < MAX_BOOT_TRIGGERS; i++) {
trigger = strsep(&buf, ",");
if (!trigger)
break;
@@ -4491,7 +4504,11 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
static __init int setup_trace_event(char *str)
{
- strscpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
+ if (bootup_event_buf[0] != '\0')
+ strlcat(bootup_event_buf, ",", COMMAND_LINE_SIZE);
+
+ strlcat(bootup_event_buf, str, COMMAND_LINE_SIZE);
+
trace_set_ring_buffer_expanded(NULL);
disable_tracing_selftest("running event tracing");
@@ -4530,31 +4547,44 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
int nr_entries;
static struct eventfs_entry events_entries[] = {
{
- .name = "enable",
+ .name = "header_page",
.callback = events_callback,
},
{
- .name = "header_page",
+ .name = "header_event",
.callback = events_callback,
},
+#define NR_RO_TOP_ENTRIES 2
+/* Readonly files must be above this line and counted by NR_RO_TOP_ENTRIES. */
{
- .name = "header_event",
+ .name = "enable",
.callback = events_callback,
},
};
- entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
- tr, &ftrace_set_event_fops);
- if (!entry)
- return -ENOMEM;
+ if (!trace_array_is_readonly(tr)) {
+ entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
+ tr, &ftrace_set_event_fops);
+ if (!entry)
+ return -ENOMEM;
- trace_create_file("show_event_filters", TRACE_MODE_READ, parent, tr,
- &ftrace_show_event_filters_fops);
+ /* These are not as crucial, just warn if they are not created */
+ trace_create_file("show_event_filters", TRACE_MODE_READ, parent, tr,
+ &ftrace_show_event_filters_fops);
- trace_create_file("show_event_triggers", TRACE_MODE_READ, parent, tr,
- &ftrace_show_event_triggers_fops);
+ trace_create_file("show_event_triggers", TRACE_MODE_READ, parent, tr,
+ &ftrace_show_event_triggers_fops);
- nr_entries = ARRAY_SIZE(events_entries);
+ trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
+ tr, &ftrace_set_event_pid_fops);
+
+ trace_create_file("set_event_notrace_pid",
+ TRACE_MODE_WRITE, parent, tr,
+ &ftrace_set_event_notrace_pid_fops);
+ nr_entries = ARRAY_SIZE(events_entries);
+ } else {
+ nr_entries = NR_RO_TOP_ENTRIES;
+ }
e_events = eventfs_create_events_dir("events", parent, events_entries,
nr_entries, tr);
@@ -4563,15 +4593,6 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
return -ENOMEM;
}
- /* There are not as crucial, just warn if they are not created */
-
- trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
- tr, &ftrace_set_event_pid_fops);
-
- trace_create_file("set_event_notrace_pid",
- TRACE_MODE_WRITE, parent, tr,
- &ftrace_set_event_notrace_pid_fops);
-
tr->event_dir = e_events;
return 0;
@@ -4668,26 +4689,22 @@ static __init int event_trace_memsetup(void)
return 0;
}
-__init void
-early_enable_events(struct trace_array *tr, char *buf, bool disable_first)
+/*
+ * Helper function to enable or disable a comma-separated list of events
+ * from the bootup buffer.
+ */
+static __init void __early_set_events(struct trace_array *tr, char *buf, bool enable)
{
char *token;
- int ret;
-
- while (true) {
- token = strsep(&buf, ",");
-
- if (!token)
- break;
+ while ((token = strsep(&buf, ","))) {
if (*token) {
- /* Restarting syscalls requires that we stop them first */
- if (disable_first)
+ if (enable) {
+ if (ftrace_set_clr_event(tr, token, 1))
+ pr_warn("Failed to enable trace event: %s\n", token);
+ } else {
ftrace_set_clr_event(tr, token, 0);
-
- ret = ftrace_set_clr_event(tr, token, 1);
- if (ret)
- pr_warn("Failed to enable trace event: %s\n", token);
+ }
}
/* Put back the comma to allow this to be called again */
@@ -4696,6 +4713,32 @@ early_enable_events(struct trace_array *tr, char *buf, bool disable_first)
}
}
+/**
+ * early_enable_events - enable events from the bootup buffer
+ * @tr: The trace array to enable the events in
+ * @buf: The buffer containing the comma separated list of events
+ * @disable_first: If true, disable all events in @buf before enabling them
+ *
+ * This function enables events from the bootup buffer. If @disable_first
+ * is true, it will first disable all events in the buffer before enabling
+ * them.
+ *
+ * For syscall events, which rely on a global refcount to register the
+ * SYSCALL_WORK_SYSCALL_TRACEPOINT flag (especially for pid 1), we must
+ * ensure the refcount hits zero before re-enabling them. A simple
+ * "disable then enable" per-event is not enough if multiple syscalls are
+ * used, as the refcount will stay above zero. Thus, we need a two-phase
+ * approach: disable all, then enable all.
+ */
+__init void
+early_enable_events(struct trace_array *tr, char *buf, bool disable_first)
+{
+ if (disable_first)
+ __early_set_events(tr, buf, false);
+
+ __early_set_events(tr, buf, true);
+}
+
static __init int event_trace_enable(void)
{
struct trace_array *tr = top_trace_array();
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 73ea180cad55..0dbbf6cca9bc 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1361,12 +1361,17 @@ static const char *hist_field_name(struct hist_field *field,
field->flags & HIST_FIELD_FL_VAR_REF) {
if (field->system) {
static char full_name[MAX_FILTER_STR_VAL];
+ static char *fmt;
+ int len;
+
+ fmt = field->flags & HIST_FIELD_FL_VAR_REF ? "%s.%s.$%s" : "%s.%s.%s";
+
+ len = snprintf(full_name, sizeof(full_name), fmt,
+ field->system, field->event_name,
+ field->name);
+ if (len >= sizeof(full_name))
+ return NULL;
- strcat(full_name, field->system);
- strcat(full_name, ".");
- strcat(full_name, field->event_name);
- strcat(full_name, ".");
- strcat(full_name, field->name);
field_name = full_name;
} else
field_name = field->name;
@@ -1740,9 +1745,10 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
static void expr_field_str(struct hist_field *field, char *expr)
{
- if (field->flags & HIST_FIELD_FL_VAR_REF)
- strcat(expr, "$");
- else if (field->flags & HIST_FIELD_FL_CONST) {
+ if (field->flags & HIST_FIELD_FL_VAR_REF) {
+ if (!field->system)
+ strcat(expr, "$");
+ } else if (field->flags & HIST_FIELD_FL_CONST) {
char str[HIST_CONST_DIGITS_MAX];
snprintf(str, HIST_CONST_DIGITS_MAX, "%llu", field->constant);
@@ -5836,8 +5842,6 @@ static int event_hist_open(struct inode *inode, struct file *file)
hist_file->file = file;
hist_file->last_act = get_hist_hit_count(event_file);
- /* Clear private_data to avoid warning in single_open() */
- file->private_data = NULL;
ret = single_open(file, hist_show, hist_file);
if (ret) {
kfree(hist_file);
@@ -6126,8 +6130,6 @@ static int event_hist_debug_open(struct inode *inode, struct file *file)
if (ret)
return ret;
- /* Clear private_data to avoid warning in single_open() */
- file->private_data = NULL;
ret = single_open(file, hist_debug_show, file);
if (ret)
tracing_release_file_tr(inode, file);
@@ -6158,7 +6160,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
else if (field_name) {
if (hist_field->flags & HIST_FIELD_FL_VAR_REF ||
hist_field->flags & HIST_FIELD_FL_ALIAS)
- seq_putc(m, '$');
+ if (!hist_field->system)
+ seq_putc(m, '$');
seq_printf(m, "%s", field_name);
} else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP)
seq_puts(m, "common_timestamp");
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 8bb95b2a6fcf..39ac4eba0702 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -395,7 +395,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
n_u64++;
} else {
struct trace_print_flags __flags[] = {
- __def_gfpflag_names, {-1, NULL} };
+ __def_gfpflag_names };
char *space = (i == se->n_fields - 1 ? "" : " ");
print_synth_event_num_val(s, print_fmt,
@@ -408,7 +408,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
trace_seq_puts(s, " (");
trace_print_flags_seq(s, "|",
entry->fields[n_u64].as_u64,
- __flags);
+ __flags, ARRAY_SIZE(__flags));
trace_seq_putc(s, ')');
}
n_u64++;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index fecbd679d432..655db2e82513 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -22,6 +22,39 @@ static struct task_struct *trigger_kthread;
static struct llist_head trigger_data_free_list;
static DEFINE_MUTEX(trigger_data_kthread_mutex);
+static int trigger_kthread_fn(void *ignore);
+
+static void trigger_create_kthread_locked(void)
+{
+ lockdep_assert_held(&trigger_data_kthread_mutex);
+
+ if (!trigger_kthread) {
+ struct task_struct *kthread;
+
+ kthread = kthread_create(trigger_kthread_fn, NULL,
+ "trigger_data_free");
+ if (!IS_ERR(kthread))
+ WRITE_ONCE(trigger_kthread, kthread);
+ }
+}
+
+static void trigger_data_free_queued_locked(void)
+{
+ struct event_trigger_data *data, *tmp;
+ struct llist_node *llnodes;
+
+ lockdep_assert_held(&trigger_data_kthread_mutex);
+
+ llnodes = llist_del_all(&trigger_data_free_list);
+ if (!llnodes)
+ return;
+
+ tracepoint_synchronize_unregister();
+
+ llist_for_each_entry_safe(data, tmp, llnodes, llist)
+ kfree(data);
+}
+
/* Bulk garbage collection of event_trigger_data elements */
static int trigger_kthread_fn(void *ignore)
{
@@ -50,33 +83,56 @@ static int trigger_kthread_fn(void *ignore)
+/*
+ * Free an event_trigger_data. When possible the data is queued for the
+ * deferred-free kthread so the expensive tracepoint synchronization is
+ * batched; during early boot (no kthread yet) it is queued for the
+ * late initcall to drain, and if kthread creation fails it is drained
+ * synchronously under trigger_data_kthread_mutex.
+ */
 void trigger_data_free(struct event_trigger_data *data)
 {
+	if (!data)
+		return;
+
 	if (data->cmd_ops->set_filter)
 		data->cmd_ops->set_filter(NULL, data, NULL);
+	/*
+	 * Boot-time trigger registration can fail before kthread creation
+	 * works. Keep the deferred-free semantics during boot and let late
+	 * init start the kthread to drain the list.
+	 */
+	if (system_state == SYSTEM_BOOTING && !trigger_kthread) {
+		llist_add(&data->llist, &trigger_data_free_list);
+		return;
+	}
+
 	if (unlikely(!trigger_kthread)) {
 		guard(mutex)(&trigger_data_kthread_mutex);
+
+		trigger_create_kthread_locked();
 		/* Check again after taking mutex */
 		if (!trigger_kthread) {
-			struct task_struct *kthread;
-
-			kthread = kthread_create(trigger_kthread_fn, NULL,
-						 "trigger_data_free");
-			if (!IS_ERR(kthread))
-				WRITE_ONCE(trigger_kthread, kthread);
+			llist_add(&data->llist, &trigger_data_free_list);
+			/* Drain the queued frees synchronously if creation failed. */
+			trigger_data_free_queued_locked();
+			return;
 		}
 	}
-	if (!trigger_kthread) {
-		/* Do it the slow way */
-		tracepoint_synchronize_unregister();
-		kfree(data);
-		return;
-	}
-
 	llist_add(&data->llist, &trigger_data_free_list);
 	wake_up_process(trigger_kthread);
 }
+/*
+ * Late initcall: drain any trigger data queued during early boot, when
+ * the deferred-free kthread could not yet be created. Prefers waking
+ * the kthread; falls back to a synchronous drain if creation fails.
+ */
+static int __init trigger_data_free_init(void)
+{
+	guard(mutex)(&trigger_data_kthread_mutex);
+
+	if (llist_empty(&trigger_data_free_list))
+		return 0;
+
+	trigger_create_kthread_locked();
+	if (trigger_kthread)
+		wake_up_process(trigger_kthread);
+	else
+		trigger_data_free_queued_locked();
+
+	return 0;
+}
+late_initcall(trigger_data_free_init);
+
static inline void data_ops_trigger(struct event_trigger_data *data,
struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 3d8239fee004..0d2d3a2ea7dd 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -400,14 +400,19 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace,
struct fgraph_ops *gops,
struct ftrace_regs *fregs)
{
+ unsigned long *task_var = fgraph_get_task_var(gops);
struct fgraph_times *ftimes;
struct trace_array *tr;
+ unsigned int trace_ctx;
+ u64 calltime, rettime;
int size;
+ rettime = trace_clock_local();
+
ftrace_graph_addr_finish(gops, trace);
- if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) {
- trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT);
+ if (*task_var & TRACE_GRAPH_NOTRACE) {
+ *task_var &= ~TRACE_GRAPH_NOTRACE;
return;
}
@@ -418,11 +423,13 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace,
tr = gops->private;
handle_nosleeptime(tr, trace, ftimes, size);
- if (tracing_thresh &&
- (trace_clock_local() - ftimes->calltime < tracing_thresh))
+ calltime = ftimes->calltime;
+
+ if (tracing_thresh && (rettime - calltime < tracing_thresh))
return;
- else
- trace_graph_return(trace, gops, fregs);
+
+ trace_ctx = tracing_gen_ctx();
+ __trace_graph_return(tr, trace, trace_ctx, calltime, rettime);
}
static struct fgraph_ops funcgraph_ops = {
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index a5dbb72528e0..a8420e6abb56 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -31,7 +31,8 @@ static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata;
static int __init set_kprobe_boot_events(char *str)
{
- strscpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE);
+ trace_append_boot_param(kprobe_boot_events_buf, str, ';',
+ COMMAND_LINE_SIZE);
disable_tracing_selftest("running kprobe events");
return 1;
@@ -765,6 +766,14 @@ static unsigned int number_of_same_symbols(const char *mod, const char *func_nam
if (!mod)
kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count);
+ /*
+ * If the symbol is found in vmlinux, use vmlinux resolution only.
+ * This prevents module symbols from shadowing vmlinux symbols
+ * and causing -EADDRNOTAVAIL for unqualified kprobe targets.
+ */
+ if (!mod && ctx.count > 0)
+ return ctx.count;
+
module_kallsyms_on_each_symbol(mod, count_mod_symbols, &ctx);
return ctx.count;
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index dee610e465b9..75678053b21c 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -58,6 +58,7 @@ enum osnoise_options_index {
OSN_PANIC_ON_STOP,
OSN_PREEMPT_DISABLE,
OSN_IRQ_DISABLE,
+ OSN_TIMERLAT_ALIGN,
OSN_MAX
};
@@ -66,7 +67,8 @@ static const char * const osnoise_options_str[OSN_MAX] = {
"OSNOISE_WORKLOAD",
"PANIC_ON_STOP",
"OSNOISE_PREEMPT_DISABLE",
- "OSNOISE_IRQ_DISABLE" };
+ "OSNOISE_IRQ_DISABLE",
+ "TIMERLAT_ALIGN" };
#define OSN_DEFAULT_OPTIONS 0x2
static unsigned long osnoise_options = OSN_DEFAULT_OPTIONS;
@@ -251,6 +253,11 @@ struct timerlat_variables {
static DEFINE_PER_CPU(struct timerlat_variables, per_cpu_timerlat_var);
/*
+ * timerlat wake-up offset for next thread with TIMERLAT_ALIGN set.
+ */
+static atomic64_t align_next;
+
+/*
* this_cpu_tmr_var - Return the per-cpu timerlat_variables on its relative CPU
*/
static inline struct timerlat_variables *this_cpu_tmr_var(void)
@@ -268,6 +275,7 @@ static inline void tlat_var_reset(void)
/* Synchronize with the timerlat interfaces */
mutex_lock(&interface_lock);
+
/*
* So far, all the values are initialized as 0, so
* zeroing the structure is perfect.
@@ -278,6 +286,12 @@ static inline void tlat_var_reset(void)
hrtimer_cancel(&tlat_var->timer);
memset(tlat_var, 0, sizeof(*tlat_var));
}
+ /*
+ * Reset also align_next, to be filled by a new offset by the first timerlat
+ * thread that wakes up, if TIMERLAT_ALIGN is set.
+ */
+ atomic64_set(&align_next, 0);
+
mutex_unlock(&interface_lock);
}
#else /* CONFIG_TIMERLAT_TRACER */
@@ -326,6 +340,7 @@ static struct osnoise_data {
u64 stop_tracing_total; /* stop trace in the final operation (report/thread) */
#ifdef CONFIG_TIMERLAT_TRACER
u64 timerlat_period; /* timerlat period */
+ u64 timerlat_align_us; /* timerlat alignment */
u64 print_stack; /* print IRQ stack if total > */
int timerlat_tracer; /* timerlat tracer */
#endif
@@ -338,6 +353,7 @@ static struct osnoise_data {
#ifdef CONFIG_TIMERLAT_TRACER
.print_stack = 0,
.timerlat_period = DEFAULT_TIMERLAT_PERIOD,
+ .timerlat_align_us = 0,
.timerlat_tracer = 0,
#endif
};
@@ -1830,6 +1846,26 @@ static int wait_next_period(struct timerlat_variables *tlat)
tlat->abs_period = (u64) ktime_to_ns(next_abs_period);
/*
+ * Align thread in the first cycle on each CPU to the set alignment
+ * if TIMERLAT_ALIGN is set.
+ *
+ * This is done by using an atomic64_t to store the next absolute period.
+ * The first thread that wakes up will set the atomic64_t to its
+ * absolute period, and the other threads will increment it by
+ * the alignment value.
+ */
+ if (test_bit(OSN_TIMERLAT_ALIGN, &osnoise_options) && !tlat->count
+ && atomic64_cmpxchg_relaxed(&align_next, 0, tlat->abs_period)) {
+ /*
+ * A thread has already set align_next, use it and increment it
+ * to be used by the next thread that wakes up after this one.
+ */
+ tlat->abs_period = atomic64_add_return_relaxed(
+ osnoise_data.timerlat_align_us * 1000, &align_next);
+ next_abs_period = ns_to_ktime(tlat->abs_period);
+ }
+
+ /*
* If the new abs_period is in the past, skip the activation.
*/
while (ktime_compare(now, next_abs_period) > 0) {
@@ -2073,8 +2109,8 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy)
if (!osnoise_has_registered_instances())
return;
- guard(mutex)(&interface_lock);
guard(cpus_read_lock)();
+ guard(mutex)(&interface_lock);
if (!cpu_online(cpu))
return;
@@ -2237,11 +2273,11 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf,
if (running)
stop_per_cpu_kthreads();
- mutex_lock(&interface_lock);
/*
* avoid CPU hotplug operations that might read options.
*/
cpus_read_lock();
+ mutex_lock(&interface_lock);
retval = cnt;
@@ -2257,8 +2293,8 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf,
clear_bit(option, &osnoise_options);
}
- cpus_read_unlock();
mutex_unlock(&interface_lock);
+ cpus_read_unlock();
if (running)
start_per_cpu_kthreads();
@@ -2345,16 +2381,16 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
if (running)
stop_per_cpu_kthreads();
- mutex_lock(&interface_lock);
/*
* osnoise_cpumask is read by CPU hotplug operations.
*/
cpus_read_lock();
+ mutex_lock(&interface_lock);
cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new);
- cpus_read_unlock();
mutex_unlock(&interface_lock);
+ cpus_read_unlock();
if (running)
start_per_cpu_kthreads();
@@ -2650,6 +2686,17 @@ static struct trace_min_max_param timerlat_period = {
.min = &timerlat_min_period,
};
+/*
+ * osnoise/timerlat_align_us: align the first wakeup of all timerlat
+ * threads to a common boundary (in us). 0 means disabled.
+ */
+static struct trace_min_max_param timerlat_align_us = {
+ .lock = &interface_lock,
+ .val = &osnoise_data.timerlat_align_us,
+ .max = NULL,
+ .min = NULL,
+};
+
static const struct file_operations timerlat_fd_fops = {
.open = timerlat_fd_open,
.read = timerlat_fd_read,
@@ -2746,6 +2793,11 @@ static int init_timerlat_tracefs(struct dentry *top_dir)
if (!tmp)
return -ENOMEM;
+ tmp = tracefs_create_file("timerlat_align_us", TRACE_MODE_WRITE, top_dir,
+ &timerlat_align_us, &trace_min_max_fops);
+ if (!tmp)
+ return -ENOMEM;
+
retval = osnoise_create_cpu_timerlat_fd(top_dir);
if (retval)
return retval;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 1996d7aba038..a5ad76175d10 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -69,14 +69,15 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
const char *
trace_print_flags_seq(struct trace_seq *p, const char *delim,
unsigned long flags,
- const struct trace_print_flags *flag_array)
+ const struct trace_print_flags *flag_array,
+ size_t flag_array_size)
{
unsigned long mask;
const char *str;
const char *ret = trace_seq_buffer_ptr(p);
int i, first = 1;
- for (i = 0; flag_array[i].name && flags; i++) {
+ for (i = 0; i < flag_array_size && flags; i++) {
mask = flag_array[i].mask;
if ((flags & mask) != mask)
@@ -106,12 +107,13 @@ EXPORT_SYMBOL(trace_print_flags_seq);
const char *
trace_print_symbols_seq(struct trace_seq *p, unsigned long val,
- const struct trace_print_flags *symbol_array)
+ const struct trace_print_flags *symbol_array,
+ size_t symbol_array_size)
{
int i;
const char *ret = trace_seq_buffer_ptr(p);
- for (i = 0; symbol_array[i].name; i++) {
+ for (i = 0; i < symbol_array_size; i++) {
if (val != symbol_array[i].mask)
continue;
@@ -133,14 +135,15 @@ EXPORT_SYMBOL(trace_print_symbols_seq);
const char *
trace_print_flags_seq_u64(struct trace_seq *p, const char *delim,
unsigned long long flags,
- const struct trace_print_flags_u64 *flag_array)
+ const struct trace_print_flags_u64 *flag_array,
+ size_t flag_array_size)
{
unsigned long long mask;
const char *str;
const char *ret = trace_seq_buffer_ptr(p);
int i, first = 1;
- for (i = 0; flag_array[i].name && flags; i++) {
+ for (i = 0; i < flag_array_size && flags; i++) {
mask = flag_array[i].mask;
if ((flags & mask) != mask)
@@ -170,12 +173,13 @@ EXPORT_SYMBOL(trace_print_flags_seq_u64);
const char *
trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
- const struct trace_print_flags_u64 *symbol_array)
+ const struct trace_print_flags_u64 *symbol_array,
+ size_t symbol_array_size)
{
int i;
const char *ret = trace_seq_buffer_ptr(p);
- for (i = 0; symbol_array[i].name; i++) {
+ for (i = 0; i < symbol_array_size; i++) {
if (val != symbol_array[i].mask)
continue;
@@ -719,12 +723,13 @@ void print_function_args(struct trace_seq *s, unsigned long *args,
{
const struct btf_param *param;
const struct btf_type *t;
+ const struct btf_enum *enums;
const char *param_name;
char name[KSYM_NAME_LEN];
unsigned long arg;
struct btf *btf;
s32 tid, nr = 0;
- int a, p, x;
+ int a, p, x, i;
u16 encode;
trace_seq_printf(s, "(");
@@ -778,6 +783,15 @@ void print_function_args(struct trace_seq *s, unsigned long *args,
break;
case BTF_KIND_ENUM:
trace_seq_printf(s, "%ld", arg);
+ enums = btf_enum(t);
+ for (i = 0; i < btf_vlen(t); i++) {
+ if (arg == enums[i].val) {
+ trace_seq_printf(s, " [%s]",
+ btf_name_by_offset(btf,
+ enums[i].name_off));
+ break;
+ }
+ }
break;
default:
/* This does not handle complex arguments */
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 5ea5e0d76f00..3ea17af60169 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -197,6 +197,7 @@ struct notifier_block module_trace_bprintk_format_nb = {
.notifier_call = module_trace_bprintk_format_notify,
};
+__printf(2, 3)
int __trace_bprintk(unsigned long ip, const char *fmt, ...)
{
int ret;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index e0a5dc86c07e..e1c73065dae5 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1068,7 +1068,7 @@ static int __parse_imm_string(char *str, char **pbuf, int offs)
{
size_t len = strlen(str);
- if (str[len - 1] != '"') {
+ if (!len || str[len - 1] != '"') {
trace_probe_log_err(offs + len, IMMSTR_NO_CLOSE);
return -EINVAL;
}
diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
new file mode 100644
index 000000000000..d6c3f94d67cd
--- /dev/null
+++ b/kernel/trace/trace_remote.c
@@ -0,0 +1,1384 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 - Google LLC
+ * Author: Vincent Donnefort <vdonnefort@google.com>
+ */
+
+#include <linux/kstrtox.h>
+#include <linux/lockdep.h>
+#include <linux/mutex.h>
+#include <linux/tracefs.h>
+#include <linux/trace_remote.h>
+#include <linux/trace_seq.h>
+#include <linux/types.h>
+
+#include "trace.h"
+
+#define TRACEFS_DIR "remotes"
+#define TRACEFS_MODE_WRITE 0640
+#define TRACEFS_MODE_READ 0440
+
+/* Reader mode: consuming (trace_pipe) vs non-consuming (trace). */
+enum tri_type {
+	TRI_CONSUMING,
+	TRI_NONCONSUMING,
+};
+
+/*
+ * Per-open reader state for a remote trace buffer.
+ *
+ * TRI_CONSUMING readers poll the remote periodically via poll_work and
+ * consume events; TRI_NONCONSUMING readers walk rb_iter (single CPU)
+ * or rb_iters (one slot per possible CPU) in place. evt, evt_cpu, ts
+ * and lost_events describe the most recently peeked event; pos is the
+ * seq_file position for non-consuming reads.
+ */
+struct trace_remote_iterator {
+	struct trace_remote		*remote;
+	struct trace_seq		seq;
+	struct delayed_work		poll_work;
+	unsigned long			lost_events;
+	u64				ts;
+	struct ring_buffer_iter		*rb_iter;
+	struct ring_buffer_iter		**rb_iters;
+	struct remote_event_hdr		*evt;
+	int				cpu;
+	int				evt_cpu;
+	loff_t				pos;
+	enum tri_type			type;
+};
+
+/*
+ * A registered remote tracing entity.
+ *
+ * @lock serializes buffer load/unload, tracing on/off and reader
+ * accounting. @reader_lock arbitrates between all-CPU and per-CPU
+ * readers; @pcpu_reader_locks (allocated on first per-CPU read) do the
+ * same per CPU. The trace buffer is lazily loaded on first use and
+ * unloaded again once it has no readers, tracing is off and it is
+ * empty.
+ */
+struct trace_remote {
+	struct trace_remote_callbacks	*cbs;
+	void				*priv;
+	struct trace_buffer		*trace_buffer;
+	struct trace_buffer_desc	*trace_buffer_desc;
+	struct dentry			*dentry;
+	struct eventfs_inode		*eventfs;
+	struct remote_event		*events;
+	unsigned long			nr_events;
+	unsigned long			trace_buffer_size;
+	struct ring_buffer_remote	rb_remote;
+	struct mutex			lock;
+	struct rw_semaphore		reader_lock;
+	struct rw_semaphore		*pcpu_reader_locks;
+	unsigned int			nr_readers;
+	unsigned int			poll_ms;
+	bool				tracing_on;
+};
+
+/* True once the remote's ring buffer has been loaded and wrapped. */
+static bool trace_remote_loaded(struct trace_remote *remote)
+{
+	return !!remote->trace_buffer;
+}
+
+/*
+ * Lazily load the remote's trace buffer and wrap it in a local
+ * ring_buffer. No-op (returns 0) when already loaded; on failure the
+ * remote buffer is unloaded again. Caller must hold remote->lock.
+ */
+static int trace_remote_load(struct trace_remote *remote)
+{
+	struct ring_buffer_remote *rb_remote = &remote->rb_remote;
+	struct trace_buffer_desc *desc;
+
+	lockdep_assert_held(&remote->lock);
+
+	if (trace_remote_loaded(remote))
+		return 0;
+
+	desc = remote->cbs->load_trace_buffer(remote->trace_buffer_size, remote->priv);
+	if (IS_ERR(desc))
+		return PTR_ERR(desc);
+
+	rb_remote->desc = desc;
+	rb_remote->swap_reader_page = remote->cbs->swap_reader_page;
+	rb_remote->priv = remote->priv;
+	rb_remote->reset = remote->cbs->reset;
+	remote->trace_buffer = ring_buffer_alloc_remote(rb_remote);
+	if (!remote->trace_buffer) {
+		remote->cbs->unload_trace_buffer(desc, remote->priv);
+		return -ENOMEM;
+	}
+
+	remote->trace_buffer_desc = desc;
+
+	return 0;
+}
+
+/*
+ * Unload the remote buffer if nothing prevents it: no readers, tracing
+ * disabled and no unread data. Safe to call unconditionally, no-op
+ * otherwise. Caller must hold remote->lock.
+ */
+static void trace_remote_try_unload(struct trace_remote *remote)
+{
+	lockdep_assert_held(&remote->lock);
+
+	if (!trace_remote_loaded(remote))
+		return;
+
+	/* The buffer is being read or writable */
+	if (remote->nr_readers || remote->tracing_on)
+		return;
+
+	/* The buffer has readable data */
+	if (!ring_buffer_empty(remote->trace_buffer))
+		return;
+
+	ring_buffer_free(remote->trace_buffer);
+	remote->trace_buffer = NULL;
+	remote->cbs->unload_trace_buffer(remote->trace_buffer_desc, remote->priv);
+}
+
+/*
+ * Enable tracing on the remote, loading the buffer first if needed.
+ * If the remote refuses to enable, a buffer loaded by this call is
+ * unloaded again. Caller must hold remote->lock.
+ */
+static int trace_remote_enable_tracing(struct trace_remote *remote)
+{
+	int ret;
+
+	lockdep_assert_held(&remote->lock);
+
+	if (remote->tracing_on)
+		return 0;
+
+	ret = trace_remote_load(remote);
+	if (ret)
+		return ret;
+
+	ret = remote->cbs->enable_tracing(true, remote->priv);
+	if (ret) {
+		trace_remote_try_unload(remote);
+		return ret;
+	}
+
+	remote->tracing_on = true;
+
+	return 0;
+}
+
+/*
+ * Ask the remote to stop tracing, then pull any final data from it so
+ * readers can still consume it, and opportunistically unload the
+ * buffer. Caller must hold remote->lock.
+ */
+static int trace_remote_disable_tracing(struct trace_remote *remote)
+{
+	int ret;
+
+	lockdep_assert_held(&remote->lock);
+
+	if (!remote->tracing_on)
+		return 0;
+
+	ret = remote->cbs->enable_tracing(false, remote->priv);
+	if (ret)
+		return ret;
+
+	/* Fetch the last data written by the remote before it stopped */
+	ring_buffer_poll_remote(remote->trace_buffer, RING_BUFFER_ALL_CPUS);
+	remote->tracing_on = false;
+	trace_remote_try_unload(remote);
+
+	return 0;
+}
+
+/*
+ * Discard buffered data for @cpu (or every CPU with
+ * RING_BUFFER_ALL_CPUS), then try to unload the now-empty buffer.
+ * Caller must hold remote->lock.
+ */
+static void trace_remote_reset(struct trace_remote *remote, int cpu)
+{
+	lockdep_assert_held(&remote->lock);
+
+	if (!trace_remote_loaded(remote))
+		return;
+
+	if (cpu == RING_BUFFER_ALL_CPUS)
+		ring_buffer_reset(remote->trace_buffer);
+	else
+		ring_buffer_reset_cpu(remote->trace_buffer, cpu);
+
+	trace_remote_try_unload(remote);
+}
+
+/*
+ * Write handler for remotes/<name>/tracing_on: "0" disables tracing on
+ * the remote, any non-zero value enables it.
+ */
+static ssize_t
+tracing_on_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct seq_file *seq = filp->private_data;
+	struct trace_remote *remote = seq->private;
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+	if (ret)
+		return ret;
+
+	guard(mutex)(&remote->lock);
+
+	ret = val ? trace_remote_enable_tracing(remote) : trace_remote_disable_tracing(remote);
+	if (ret)
+		return ret;
+
+	return cnt;
+}
+/* Read handler: report whether remote tracing is currently enabled. */
+static int tracing_on_show(struct seq_file *s, void *unused)
+{
+	struct trace_remote *remote = s->private;
+
+	seq_printf(s, "%d\n", remote->tracing_on);
+
+	return 0;
+}
+DEFINE_SHOW_STORE_ATTRIBUTE(tracing_on);
+
+/*
+ * Write handler for buffer_size_kb. The value is in KiB, must be
+ * non-zero and must not overflow when converted to bytes. The size can
+ * only be changed while the buffer is unloaded (-EBUSY otherwise).
+ */
+static ssize_t buffer_size_kb_write(struct file *filp, const char __user *ubuf, size_t cnt,
+				    loff_t *ppos)
+{
+	struct seq_file *seq = filp->private_data;
+	struct trace_remote *remote = seq->private;
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+	if (ret)
+		return ret;
+
+	/* KiB to Bytes */
+	if (!val || check_shl_overflow(val, 10, &val))
+		return -EINVAL;
+
+	guard(mutex)(&remote->lock);
+
+	if (trace_remote_loaded(remote))
+		return -EBUSY;
+
+	remote->trace_buffer_size = val;
+
+	return cnt;
+}
+
+/* Report the configured size (KiB) and whether the buffer is loaded. */
+static int buffer_size_kb_show(struct seq_file *s, void *unused)
+{
+	struct trace_remote *remote = s->private;
+
+	seq_printf(s, "%lu (%s)\n", remote->trace_buffer_size >> 10,
+		   trace_remote_loaded(remote) ? "loaded" : "unloaded");
+
+	return 0;
+}
+DEFINE_SHOW_STORE_ATTRIBUTE(buffer_size_kb);
+
+/*
+ * Take a reader reference on @remote, loading the trace buffer if
+ * needed and, for per-CPU readers, allocating the per-CPU reader locks
+ * on first use. Paired with trace_remote_put().
+ *
+ * Caller must hold remote->lock (asserted, like the load/unload
+ * helpers this function relies on).
+ */
+static int trace_remote_get(struct trace_remote *remote, int cpu)
+{
+	int ret;
+
+	lockdep_assert_held(&remote->lock);
+
+	if (remote->nr_readers == UINT_MAX)
+		return -EBUSY;
+
+	ret = trace_remote_load(remote);
+	if (ret)
+		return ret;
+
+	if (cpu != RING_BUFFER_ALL_CPUS && !remote->pcpu_reader_locks) {
+		int lock_cpu;
+
+		remote->pcpu_reader_locks = kcalloc(nr_cpu_ids, sizeof(*remote->pcpu_reader_locks),
+						    GFP_KERNEL);
+		if (!remote->pcpu_reader_locks) {
+			trace_remote_try_unload(remote);
+			return -ENOMEM;
+		}
+
+		for_each_possible_cpu(lock_cpu)
+			init_rwsem(&remote->pcpu_reader_locks[lock_cpu]);
+	}
+
+	remote->nr_readers++;
+
+	return 0;
+}
+
+/*
+ * Drop a reader reference taken with trace_remote_get(). When the last
+ * reader goes away, free the per-CPU reader locks and try to unload
+ * the buffer. Caller must hold remote->lock.
+ */
+static void trace_remote_put(struct trace_remote *remote)
+{
+	if (WARN_ON(!remote->nr_readers))
+		return;
+
+	remote->nr_readers--;
+	if (remote->nr_readers)
+		return;
+
+	kfree(remote->pcpu_reader_locks);
+	remote->pcpu_reader_locks = NULL;
+
+	trace_remote_try_unload(remote);
+}
+
+/*
+ * Whether @cpu exists in the remote's buffer. A successful poll of the
+ * CPU is used as the existence test; RING_BUFFER_ALL_CPUS always
+ * matches.
+ */
+static bool trace_remote_has_cpu(struct trace_remote *remote, int cpu)
+{
+	if (cpu == RING_BUFFER_ALL_CPUS)
+		return true;
+
+	return ring_buffer_poll_remote(remote->trace_buffer, cpu) == 0;
+}
+
+/*
+ * Delayed work used by consuming readers: fetch new data from the
+ * remote for this iterator's CPU scope, then re-arm itself every
+ * remote->poll_ms milliseconds.
+ */
+static void __poll_remote(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct trace_remote_iterator *iter;
+
+	iter = container_of(dwork, struct trace_remote_iterator, poll_work);
+	ring_buffer_poll_remote(iter->remote->trace_buffer, iter->cpu);
+	/* Use dwork directly instead of re-casting @work. */
+	schedule_delayed_work(dwork, msecs_to_jiffies(iter->remote->poll_ms));
+}
+
+/*
+ * Tear down the non-consuming iterator(s) created by
+ * __alloc_ring_buffer_iter(): the single rb_iter, or every allocated
+ * per-CPU slot plus the rb_iters array itself.
+ */
+static void __free_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu)
+{
+	if (cpu != RING_BUFFER_ALL_CPUS) {
+		ring_buffer_read_finish(iter->rb_iter);
+		return;
+	}
+
+	for_each_possible_cpu(cpu) {
+		if (iter->rb_iters[cpu])
+			ring_buffer_read_finish(iter->rb_iters[cpu]);
+	}
+
+	kfree(iter->rb_iters);
+}
+
+/*
+ * Create the non-consuming ring-buffer iterator(s): a single one for a
+ * specific @cpu, or one per possible CPU for RING_BUFFER_ALL_CPUS.
+ * CPUs that are not part of the remote buffer are silently skipped; a
+ * genuine allocation failure unwinds everything and returns -ENOMEM.
+ */
+static int __alloc_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu)
+{
+	if (cpu != RING_BUFFER_ALL_CPUS) {
+		iter->rb_iter = ring_buffer_read_start(iter->remote->trace_buffer, cpu, GFP_KERNEL);
+
+		return iter->rb_iter ? 0 : -ENOMEM;
+	}
+
+	iter->rb_iters = kcalloc(nr_cpu_ids, sizeof(*iter->rb_iters), GFP_KERNEL);
+	if (!iter->rb_iters)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		iter->rb_iters[cpu] = ring_buffer_read_start(iter->remote->trace_buffer, cpu,
+							     GFP_KERNEL);
+		if (!iter->rb_iters[cpu]) {
+			/* This CPU isn't part of trace_buffer. Skip it */
+			if (!trace_remote_has_cpu(iter->remote, cpu))
+				continue;
+
+			__free_ring_buffer_iter(iter, RING_BUFFER_ALL_CPUS);
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate and set up a reader iterator for @remote. Takes a reader
+ * reference (dropped again on failure). Consuming iterators start the
+ * periodic remote poll; non-consuming ones snapshot per-CPU ring
+ * buffer iterators.
+ *
+ * Returns NULL for a non-consuming read of an unloaded (hence empty)
+ * buffer, an ERR_PTR on failure, the iterator otherwise.
+ * Caller must hold remote->lock.
+ */
+static struct trace_remote_iterator
+*trace_remote_iter(struct trace_remote *remote, int cpu, enum tri_type type)
+{
+	struct trace_remote_iterator *iter = NULL;
+	int ret;
+
+	lockdep_assert_held(&remote->lock);
+
+	if (type == TRI_NONCONSUMING && !trace_remote_loaded(remote))
+		return NULL;
+
+	ret = trace_remote_get(remote, cpu);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (!trace_remote_has_cpu(remote, cpu)) {
+		ret = -ENODEV;
+		goto err;
+	}
+
+	iter = kzalloc_obj(*iter);
+	if (iter) {
+		iter->remote = remote;
+		iter->cpu = cpu;
+		iter->type = type;
+		trace_seq_init(&iter->seq);
+
+		switch (type) {
+		case TRI_CONSUMING:
+			/* Prime the buffer, then poll in the background. */
+			ring_buffer_poll_remote(remote->trace_buffer, cpu);
+			INIT_DELAYED_WORK(&iter->poll_work, __poll_remote);
+			schedule_delayed_work(&iter->poll_work, msecs_to_jiffies(remote->poll_ms));
+			break;
+		case TRI_NONCONSUMING:
+			ret = __alloc_ring_buffer_iter(iter, cpu);
+			break;
+		}
+
+		if (ret)
+			goto err;
+
+		return iter;
+	}
+	ret = -ENOMEM;
+
+err:
+	/* Drop the reader reference taken above; kfree(NULL) is a no-op. */
+	kfree(iter);
+	trace_remote_put(remote);
+
+	return ERR_PTR(ret);
+}
+
+/*
+ * Undo trace_remote_iter(): stop the polling work or free the rb
+ * iterators, then drop the reader reference. NULL-safe.
+ * Caller must hold iter->remote->lock.
+ */
+static void trace_remote_iter_free(struct trace_remote_iterator *iter)
+{
+	struct trace_remote *remote;
+
+	if (!iter)
+		return;
+
+	remote = iter->remote;
+
+	lockdep_assert_held(&remote->lock);
+
+	switch (iter->type) {
+	case TRI_CONSUMING:
+		cancel_delayed_work_sync(&iter->poll_work);
+		break;
+	case TRI_NONCONSUMING:
+		__free_ring_buffer_iter(iter, iter->cpu);
+		break;
+	}
+
+	kfree(iter);
+	trace_remote_put(remote);
+}
+
+/*
+ * Acquire the reader locks for this iterator. A consuming all-CPU
+ * reader takes the global reader_lock exclusively since it conflicts
+ * with every other reader; everyone else shares it, and per-CPU
+ * readers then take their CPU's rwsem (write for consuming, read for
+ * non-consuming). Paired with trace_remote_iter_read_finished().
+ */
+static void trace_remote_iter_read_start(struct trace_remote_iterator *iter)
+{
+	struct trace_remote *remote = iter->remote;
+	int cpu = iter->cpu;
+
+	/* Acquire global reader lock */
+	if (cpu == RING_BUFFER_ALL_CPUS && iter->type == TRI_CONSUMING)
+		down_write(&remote->reader_lock);
+	else
+		down_read(&remote->reader_lock);
+
+	if (cpu == RING_BUFFER_ALL_CPUS)
+		return;
+
+	/*
+	 * No need for the remote lock here, iter holds a reference on
+	 * remote->nr_readers
+	 */
+
+	/* Get the per-CPU one */
+	if (WARN_ON_ONCE(!remote->pcpu_reader_locks))
+		return;
+
+	if (iter->type == TRI_CONSUMING)
+		down_write(&remote->pcpu_reader_locks[cpu]);
+	else
+		down_read(&remote->pcpu_reader_locks[cpu]);
+}
+
+/*
+ * Release the locks taken by trace_remote_iter_read_start(), per-CPU
+ * rwsem first, then the global reader_lock, mirroring the acquire
+ * order.
+ */
+static void trace_remote_iter_read_finished(struct trace_remote_iterator *iter)
+{
+	struct trace_remote *remote = iter->remote;
+	int cpu = iter->cpu;
+
+	/* Release per-CPU reader lock */
+	if (cpu != RING_BUFFER_ALL_CPUS) {
+		/*
+		 * No need for the remote lock here, iter holds a reference on
+		 * remote->nr_readers
+		 */
+		if (iter->type == TRI_CONSUMING)
+			up_write(&remote->pcpu_reader_locks[cpu]);
+		else
+			up_read(&remote->pcpu_reader_locks[cpu]);
+	}
+
+	/* Release global reader lock */
+	if (cpu == RING_BUFFER_ALL_CPUS && iter->type == TRI_CONSUMING)
+		up_write(&remote->reader_lock);
+	else
+		up_read(&remote->reader_lock);
+}
+
+/* Non-consuming rb iterator for @cpu, honouring the single-CPU vs all-CPU layout. */
+static struct ring_buffer_iter *__get_rb_iter(struct trace_remote_iterator *iter, int cpu)
+{
+	return iter->cpu != RING_BUFFER_ALL_CPUS ? iter->rb_iter : iter->rb_iters[cpu];
+}
+
+/*
+ * Peek at the next event for @cpu without consuming it, filling @ts
+ * and @lost_events. Dispatches on the iterator type: ring_buffer_peek()
+ * for consuming readers, ring_buffer_iter_peek() otherwise. Returns
+ * NULL when no event is available (or the CPU has no iterator).
+ */
+static struct ring_buffer_event *
+__peek_event(struct trace_remote_iterator *iter, int cpu, u64 *ts, unsigned long *lost_events)
+{
+	struct ring_buffer_event *rb_evt;
+	struct ring_buffer_iter *rb_iter;
+
+	switch (iter->type) {
+	case TRI_CONSUMING:
+		return ring_buffer_peek(iter->remote->trace_buffer, cpu, ts, lost_events);
+	case TRI_NONCONSUMING:
+		rb_iter = __get_rb_iter(iter, cpu);
+		if (!rb_iter)
+			return NULL;
+
+		rb_evt = ring_buffer_iter_peek(rb_iter, ts);
+		if (!rb_evt)
+			return NULL;
+
+		*lost_events = ring_buffer_iter_dropped(rb_iter);
+
+		return rb_evt;
+	}
+
+	return NULL;
+}
+
+/*
+ * Load the next event into the iterator (evt, evt_cpu, ts,
+ * lost_events). For all-CPU readers, pick the oldest pending event
+ * (smallest timestamp) across all CPUs. Returns false when nothing is
+ * available.
+ */
+static bool trace_remote_iter_read_event(struct trace_remote_iterator *iter)
+{
+	struct trace_buffer *trace_buffer = iter->remote->trace_buffer;
+	struct ring_buffer_event *rb_evt;
+	int cpu = iter->cpu;
+
+	if (cpu != RING_BUFFER_ALL_CPUS) {
+		if (ring_buffer_empty_cpu(trace_buffer, cpu))
+			return false;
+
+		rb_evt = __peek_event(iter, cpu, &iter->ts, &iter->lost_events);
+		if (!rb_evt)
+			return false;
+
+		iter->evt_cpu = cpu;
+		iter->evt = ring_buffer_event_data(rb_evt);
+		return true;
+	}
+
+	/* U64_MAX doubles as the "no event found yet" sentinel. */
+	iter->ts = U64_MAX;
+	for_each_possible_cpu(cpu) {
+		unsigned long lost_events;
+		u64 ts;
+
+		if (ring_buffer_empty_cpu(trace_buffer, cpu))
+			continue;
+
+		rb_evt = __peek_event(iter, cpu, &ts, &lost_events);
+		if (!rb_evt)
+			continue;
+
+		if (ts >= iter->ts)
+			continue;
+
+		iter->ts = ts;
+		iter->evt_cpu = cpu;
+		iter->evt = ring_buffer_event_data(rb_evt);
+		iter->lost_events = lost_events;
+	}
+
+	return iter->ts != U64_MAX;
+}
+
+/* Advance past the current event: consume it, or step the rb iterator. */
+static void trace_remote_iter_move(struct trace_remote_iterator *iter)
+{
+	struct trace_buffer *trace_buffer = iter->remote->trace_buffer;
+
+	switch (iter->type) {
+	case TRI_CONSUMING:
+		ring_buffer_consume(trace_buffer, iter->evt_cpu, NULL, NULL);
+		break;
+	case TRI_NONCONSUMING:
+		ring_buffer_iter_advance(__get_rb_iter(iter, iter->evt_cpu));
+		break;
+	}
+}
+
+static struct remote_event *trace_remote_find_event(struct trace_remote *remote, unsigned short id);
+
+/*
+ * Format the current event into iter->seq: lost-event notice, CPU,
+ * timestamp as secs.usecs, then the event's own print() callback (or
+ * an UNKNOWN marker for unregistered event ids). Returns -EOVERFLOW
+ * when the seq buffer overflowed, 0 otherwise.
+ */
+static int trace_remote_iter_print_event(struct trace_remote_iterator *iter)
+{
+	struct remote_event *evt;
+	unsigned long usecs_rem;
+	u64 ts = iter->ts;
+
+	if (iter->lost_events)
+		trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
+				 iter->evt_cpu, iter->lost_events);
+
+	/* ns -> us, then split into seconds and microsecond remainder. */
+	do_div(ts, 1000);
+	usecs_rem = do_div(ts, USEC_PER_SEC);
+
+	trace_seq_printf(&iter->seq, "[%03d]\t%5llu.%06lu: ", iter->evt_cpu,
+			 ts, usecs_rem);
+
+	evt = trace_remote_find_event(iter->remote, iter->evt->id);
+	if (!evt)
+		trace_seq_printf(&iter->seq, "UNKNOWN id=%d\n", iter->evt->id);
+	else
+		evt->print(iter->evt, &iter->seq);
+
+	return trace_seq_has_overflowed(&iter->seq) ? -EOVERFLOW : 0;
+}
+
+/*
+ * Open a consuming reader on remotes/<name>/trace_pipe (or the
+ * per_cpu variant): allocate a TRI_CONSUMING iterator that polls the
+ * remote in the background.
+ */
+static int trace_pipe_open(struct inode *inode, struct file *filp)
+{
+	struct trace_remote *remote = inode->i_private;
+	struct trace_remote_iterator *iter;
+	int cpu = tracing_get_cpu(inode);
+
+	guard(mutex)(&remote->lock);
+
+	iter = trace_remote_iter(remote, cpu, TRI_CONSUMING);
+	if (IS_ERR(iter))
+		return PTR_ERR(iter);
+
+	filp->private_data = iter;
+
+	/* IS_ERR() was already handled above; iter is valid here. */
+	return 0;
+}
+
+/* Release a trace_pipe reader, freeing its iterator under the remote lock. */
+static int trace_pipe_release(struct inode *inode, struct file *filp)
+{
+	struct trace_remote_iterator *iter = filp->private_data;
+	struct trace_remote *remote = iter->remote;
+
+	guard(mutex)(&remote->lock);
+
+	trace_remote_iter_free(iter);
+
+	return 0;
+}
+
+/*
+ * Blocking read for trace_pipe: drain whatever is already formatted in
+ * iter->seq first; once empty (-EBUSY from trace_seq_to_user()), wait
+ * for data, format as many events as fit into the seq buffer, and loop
+ * back to copy them out to userspace.
+ */
+static ssize_t trace_pipe_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct trace_remote_iterator *iter = filp->private_data;
+	struct trace_buffer *trace_buffer = iter->remote->trace_buffer;
+	int ret;
+
+copy_to_user:
+	ret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+	if (ret != -EBUSY)
+		return ret;
+
+	trace_seq_init(&iter->seq);
+
+	ret = ring_buffer_wait(trace_buffer, iter->cpu, 0, NULL, NULL);
+	if (ret < 0)
+		return ret;
+
+	trace_remote_iter_read_start(iter);
+
+	while (trace_remote_iter_read_event(iter)) {
+		int prev_len = iter->seq.seq.len;
+
+		/* Event didn't fit: roll back and flush what we have. */
+		if (trace_remote_iter_print_event(iter)) {
+			iter->seq.seq.len = prev_len;
+			break;
+		}
+
+		trace_remote_iter_move(iter);
+	}
+
+	trace_remote_iter_read_finished(iter);
+
+	goto copy_to_user;
+}
+
+static const struct file_operations trace_pipe_fops = {
+	.open		= trace_pipe_open,
+	.read		= trace_pipe_read,
+	.release	= trace_pipe_release,
+};
+
+/* seq_file ->next: fetch the next event into the iterator and step past it. */
+static void *trace_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct trace_remote_iterator *iter = m->private;
+
+	++*pos;
+
+	if (!iter || !trace_remote_iter_read_event(iter))
+		return NULL;
+
+	trace_remote_iter_move(iter);
+	iter->pos++;
+
+	return iter;
+}
+
+/*
+ * seq_file ->start: take the reader locks and position the iterator.
+ * On a fresh read (*pos == 0) reset and fetch the first event;
+ * otherwise fast-forward from the remembered position to *pos.
+ */
+static void *trace_start(struct seq_file *m, loff_t *pos)
+{
+	struct trace_remote_iterator *iter = m->private;
+	/* Initialized: trace_next() increments it even on the !*pos path. */
+	loff_t i = 0;
+
+	if (!iter)
+		return NULL;
+
+	trace_remote_iter_read_start(iter);
+
+	if (!*pos) {
+		iter->pos = -1;
+		return trace_next(m, NULL, &i);
+	}
+
+	i = iter->pos;
+	while (i < *pos) {
+		iter = trace_next(m, NULL, &i);
+		if (!iter)
+			return NULL;
+	}
+
+	return iter;
+}
+
+/* seq_file ->show: format the current event, flagging oversized ones. */
+static int trace_show(struct seq_file *m, void *v)
+{
+	struct trace_remote_iterator *iter = v;
+
+	trace_seq_init(&iter->seq);
+
+	if (trace_remote_iter_print_event(iter)) {
+		seq_printf(m, "[EVENT %d PRINT TOO BIG]\n", iter->evt->id);
+		return 0;
+	}
+
+	return trace_print_seq(m, &iter->seq);
+}
+
+/* seq_file ->stop: drop the reader locks taken in trace_start(). */
+static void trace_stop(struct seq_file *m, void *v)
+{
+	struct trace_remote_iterator *iter = m->private;
+
+	if (iter)
+		trace_remote_iter_read_finished(iter);
+}
+
+static const struct seq_operations trace_sops = {
+	.start		= trace_start,
+	.next		= trace_next,
+	.show		= trace_show,
+	.stop		= trace_stop,
+};
+
+/*
+ * Open remotes/<name>/trace: for readers, build a non-consuming
+ * iterator and attach it to a seq_file. Write-only opens (used to
+ * reset the buffer) need no state.
+ */
+static int trace_open(struct inode *inode, struct file *filp)
+{
+	struct trace_remote *remote = inode->i_private;
+	struct trace_remote_iterator *iter = NULL;
+	int cpu = tracing_get_cpu(inode);
+	int ret;
+
+	if (!(filp->f_mode & FMODE_READ))
+		return 0;
+
+	guard(mutex)(&remote->lock);
+
+	/* May legitimately be NULL when the buffer is unloaded (empty). */
+	iter = trace_remote_iter(remote, cpu, TRI_NONCONSUMING);
+	if (IS_ERR(iter))
+		return PTR_ERR(iter);
+
+	ret = seq_open(filp, &trace_sops);
+	if (ret) {
+		trace_remote_iter_free(iter);
+		return ret;
+	}
+
+	((struct seq_file *)filp->private_data)->private = (void *)iter;
+
+	return 0;
+}
+
+/* Release the trace file: tear down the seq_file and any reader iterator. */
+static int trace_release(struct inode *inode, struct file *filp)
+{
+	struct trace_remote_iterator *iter;
+
+	if (!(filp->f_mode & FMODE_READ))
+		return 0;
+
+	iter = ((struct seq_file *)filp->private_data)->private;
+	seq_release(inode, filp);
+
+	/* NULL iter means the buffer was unloaded at open time. */
+	if (!iter)
+		return 0;
+
+	guard(mutex)(&iter->remote->lock);
+
+	trace_remote_iter_free(iter);
+
+	return 0;
+}
+
+/* Any write to the trace file resets the buffer for this file's CPU scope. */
+static ssize_t trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct inode *inode = file_inode(filp);
+	struct trace_remote *remote = inode->i_private;
+	int cpu = tracing_get_cpu(inode);
+
+	guard(mutex)(&remote->lock);
+
+	trace_remote_reset(remote, cpu);
+
+	return cnt;
+}
+
+static const struct file_operations trace_fops = {
+	.open		= trace_open,
+	.write		= trace_write,
+	.read		= seq_read,
+	.read_iter	= seq_read_iter,
+	.release	= trace_release,
+};
+
+/*
+ * Create the tracefs hierarchy for @remote:
+ * remotes/<name>/{tracing_on,buffer_size_kb,trace,trace_pipe} plus a
+ * per_cpu/cpuN/{trace,trace_pipe} set for each possible CPU. The
+ * top-level remotes/ directory is created once and shared between
+ * remotes; it is only torn down if this very call created it and then
+ * failed.
+ */
+static int trace_remote_init_tracefs(const char *name, struct trace_remote *remote)
+{
+	struct dentry *remote_d, *percpu_d, *d;
+	static struct dentry *root;
+	static DEFINE_MUTEX(lock);
+	bool root_inited = false;
+	int cpu;
+
+	guard(mutex)(&lock);
+
+	if (!root) {
+		root = tracefs_create_dir(TRACEFS_DIR, NULL);
+		if (!root) {
+			pr_err("Failed to create tracefs dir "TRACEFS_DIR"\n");
+			return -ENOMEM;
+		}
+		root_inited = true;
+	}
+
+	remote_d = tracefs_create_dir(name, root);
+	if (!remote_d) {
+		pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/\n", name);
+		goto err;
+	}
+
+	d = trace_create_file("tracing_on", TRACEFS_MODE_WRITE, remote_d, remote, &tracing_on_fops);
+	if (!d)
+		goto err;
+
+	d = trace_create_file("buffer_size_kb", TRACEFS_MODE_WRITE, remote_d, remote,
+			      &buffer_size_kb_fops);
+	if (!d)
+		goto err;
+
+	d = trace_create_file("trace_pipe", TRACEFS_MODE_READ, remote_d, remote, &trace_pipe_fops);
+	if (!d)
+		goto err;
+
+	d = trace_create_file("trace", TRACEFS_MODE_WRITE, remote_d, remote, &trace_fops);
+	if (!d)
+		goto err;
+
+	percpu_d = tracefs_create_dir("per_cpu", remote_d);
+	if (!percpu_d) {
+		pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/per_cpu/\n", name);
+		goto err;
+	}
+
+	for_each_possible_cpu(cpu) {
+		struct dentry *cpu_d;
+		char cpu_name[16];
+
+		snprintf(cpu_name, sizeof(cpu_name), "cpu%d", cpu);
+		cpu_d = tracefs_create_dir(cpu_name, percpu_d);
+		if (!cpu_d) {
+			pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/percpu/cpu%d\n",
+			       name, cpu);
+			goto err;
+		}
+
+		d = trace_create_cpu_file("trace_pipe", TRACEFS_MODE_READ, cpu_d, remote, cpu,
+					  &trace_pipe_fops);
+		if (!d)
+			goto err;
+
+		d = trace_create_cpu_file("trace", TRACEFS_MODE_WRITE, cpu_d, remote, cpu,
+					  &trace_fops);
+		if (!d)
+			goto err;
+	}
+
+	remote->dentry = remote_d;
+
+	return 0;
+
+err:
+	/* Removing root also removes remote_d; tracefs_remove(NULL) is safe. */
+	if (root_inited) {
+		tracefs_remove(root);
+		root = NULL;
+	} else {
+		tracefs_remove(remote_d);
+	}
+
+	return -ENOMEM;
+}
+
+static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote,
+ struct remote_event *events, size_t nr_events);
+
+/**
+ * trace_remote_register() - Register a Tracefs remote
+ * @name: Name of the remote, used for the Tracefs remotes/ directory.
+ * @cbs: Set of callbacks used to control the remote.
+ * @priv: Private data, passed to each callback from @cbs.
+ * @events: Array of events. &remote_event.name and &remote_event.id must be
+ * filled by the caller.
+ * @nr_events: Number of events in the @events array.
+ *
+ * A trace remote is an entity, outside of the kernel (most likely firmware or
+ * hypervisor) capable of writing events into a Tracefs compatible ring-buffer.
+ * The kernel would then act as a reader.
+ *
+ * The registered remote will be found under the Tracefs directory
+ * remotes/<name>.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv,
+ struct remote_event *events, size_t nr_events)
+{
+ struct trace_remote *remote;
+ int ret;
+
+ remote = kzalloc_obj(*remote);
+ if (!remote)
+ return -ENOMEM;
+
+ remote->cbs = cbs;
+ remote->priv = priv;
+ /* Default per-CPU buffer size: 7KiB; resizable via buffer_size_kb */
+ remote->trace_buffer_size = 7 << 10;
+ remote->poll_ms = 100;
+ mutex_init(&remote->lock);
+ init_rwsem(&remote->reader_lock);
+
+ if (trace_remote_init_tracefs(name, remote)) {
+ kfree(remote);
+ return -ENOMEM;
+ }
+
+ /*
+ * NOTE(review): on failure below, @remote and its tracefs entries
+ * (which hold a pointer to it via i_private) are not torn down —
+ * freeing here would be a use-after-free, so the allocation is
+ * intentionally(?) left in place. Confirm this is acceptable for a
+ * registration that cannot be retried.
+ */
+ ret = trace_remote_register_events(name, remote, events, nr_events);
+ if (ret) {
+ pr_err("Failed to register events for trace remote '%s' (%d)\n",
+ name, ret);
+ return ret;
+ }
+
+ ret = cbs->init ? cbs->init(remote->dentry, priv) : 0;
+ if (ret)
+ pr_err("Init failed for trace remote '%s' (%d)\n", name, ret);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(trace_remote_register);
+
+/**
+ * trace_remote_free_buffer() - Free trace buffer allocated with trace_remote_alloc_buffer()
+ * @desc: Descriptor of the per-CPU ring-buffers, originally filled by
+ * trace_remote_alloc_buffer()
+ *
+ * Most likely called from &trace_remote_callbacks.unload_trace_buffer.
+ */
+void trace_remote_free_buffer(struct trace_buffer_desc *desc)
+{
+ struct ring_buffer_desc *rb_desc;
+ int cpu;
+
+ /* Walk the nr_cpus descriptors packed in desc->__data */
+ for_each_ring_buffer_desc(rb_desc, cpu, desc) {
+ unsigned int id;
+
+ free_page(rb_desc->meta_va);
+
+ for (id = 0; id < rb_desc->nr_page_va; id++)
+ free_page(rb_desc->page_va[id]);
+ }
+}
+EXPORT_SYMBOL_GPL(trace_remote_free_buffer);
+
+/**
+ * trace_remote_alloc_buffer() - Dynamically allocate a trace buffer
+ * @desc: Uninitialized trace_buffer_desc
+ * @desc_size: Size of the trace_buffer_desc. Must be at least equal to
+ * trace_buffer_desc_size()
+ * @buffer_size: Size in bytes of each per-CPU ring-buffer
+ * @cpumask: CPUs to allocate a ring-buffer for
+ *
+ * Helper to dynamically allocate a set of pages (enough to cover @buffer_size)
+ * for each CPU from @cpumask and fill @desc. Most likely called from
+ * &trace_remote_callbacks.load_trace_buffer.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int trace_remote_alloc_buffer(struct trace_buffer_desc *desc, size_t desc_size, size_t buffer_size,
+ const struct cpumask *cpumask)
+{
+ /* At least two data pages per CPU, plus the meta page */
+ unsigned int nr_pages = max(DIV_ROUND_UP(buffer_size, PAGE_SIZE), 2UL) + 1;
+ /*
+ * Cast to void * before adding @desc_size: arithmetic on the typed
+ * pointer would scale by sizeof(*desc) and place the bound far past
+ * the caller's allocation, defeating the overflow check below.
+ */
+ void *desc_end = (void *)desc + desc_size;
+ struct ring_buffer_desc *rb_desc;
+ int cpu, ret = -ENOMEM;
+
+ if (desc_size < struct_size(desc, __data, 0))
+ return -EINVAL;
+
+ desc->nr_cpus = 0;
+ desc->struct_len = struct_size(desc, __data, 0);
+
+ rb_desc = (struct ring_buffer_desc *)&desc->__data[0];
+
+ for_each_cpu(cpu, cpumask) {
+ unsigned int id;
+
+ /* Does this CPU's descriptor (incl. its page array) fit in @desc? */
+ if ((void *)rb_desc + struct_size(rb_desc, page_va, nr_pages) > desc_end) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ rb_desc->cpu = cpu;
+ rb_desc->nr_page_va = 0;
+ rb_desc->meta_va = (unsigned long)__get_free_page(GFP_KERNEL);
+ /*
+ * Count this CPU as soon as its descriptor is initialized so
+ * that a failure below lets trace_remote_free_buffer() release
+ * the pages allocated so far (free_page(0) is a no-op, so a
+ * failed meta page allocation is safe to "free").
+ */
+ desc->nr_cpus++;
+ if (!rb_desc->meta_va)
+ goto err;
+
+ for (id = 0; id < nr_pages; id++) {
+ rb_desc->page_va[id] = (unsigned long)__get_free_page(GFP_KERNEL);
+ if (!rb_desc->page_va[id])
+ goto err;
+
+ rb_desc->nr_page_va++;
+ }
+ desc->struct_len += offsetof(struct ring_buffer_desc, page_va);
+ desc->struct_len += struct_size(rb_desc, page_va, rb_desc->nr_page_va);
+ rb_desc = __next_ring_buffer_desc(rb_desc);
+ }
+
+ return 0;
+
+err:
+ trace_remote_free_buffer(desc);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(trace_remote_alloc_buffer);
+
+/* Flip one event on/off via the remote's callback; no-op if already there */
+static int
+trace_remote_enable_event(struct trace_remote *remote, struct remote_event *evt, bool enable)
+{
+ int ret;
+
+ lockdep_assert_held(&remote->lock);
+
+ if (evt->enabled == enable)
+ return 0;
+
+ ret = remote->cbs->enable_event(evt->id, enable, remote->priv);
+ if (ret)
+ return ret;
+
+ evt->enabled = enable;
+
+ return 0;
+}
+
+/* events/<system>/<event>/enable: show current state */
+static int remote_event_enable_show(struct seq_file *s, void *unused)
+{
+ struct remote_event *evt = s->private;
+
+ seq_printf(s, "%d\n", evt->enabled);
+
+ return 0;
+}
+
+/* events/<system>/<event>/enable: accept 0/1 from userspace */
+static ssize_t remote_event_enable_write(struct file *filp, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct seq_file *seq = filp->private_data;
+ struct remote_event *evt = seq->private;
+ struct trace_remote *remote = evt->remote;
+ u8 enable;
+ int ret;
+
+ ret = kstrtou8_from_user(ubuf, count, 10, &enable);
+ if (ret)
+ return ret;
+
+ guard(mutex)(&remote->lock);
+
+ ret = trace_remote_enable_event(remote, evt, enable);
+ if (ret)
+ return ret;
+
+ return count;
+}
+DEFINE_SHOW_STORE_ATTRIBUTE(remote_event_enable);
+
+/* events/<system>/<event>/id: the numeric event id */
+static int remote_event_id_show(struct seq_file *s, void *unused)
+{
+ struct remote_event *evt = s->private;
+
+ seq_printf(s, "%d\n", evt->id);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(remote_event_id);
+
+/*
+ * events/<system>/<event>/format: field layout in the same format as local
+ * trace events, so existing parsers can consume it. Field offsets start
+ * after the common remote_event_hdr.
+ */
+static int remote_event_format_show(struct seq_file *s, void *unused)
+{
+ size_t offset = sizeof(struct remote_event_hdr);
+ struct remote_event *evt = s->private;
+ struct trace_event_fields *field;
+
+ seq_printf(s, "name: %s\n", evt->name);
+ seq_printf(s, "ID: %d\n", evt->id);
+ seq_puts(s,
+ "format:\n\tfield:unsigned short common_type;\toffset:0;\tsize:2;\tsigned:0;\n\n");
+
+ /* evt->fields is terminated by an entry with a NULL name */
+ field = &evt->fields[0];
+ while (field->name) {
+ seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%u;\tsigned:%d;\n",
+ field->type, field->name, offset, field->size,
+ field->is_signed);
+ offset += field->size;
+ field++;
+ }
+
+ /* Blank line between field list and print fmt, only if fields exist */
+ if (field != &evt->fields[0])
+ seq_puts(s, "\n");
+
+ seq_printf(s, "print fmt: %s\n", evt->print_fmt);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(remote_event_format);
+
+/* eventfs callback: lazily provide fops for per-event control files */
+static int remote_event_callback(const char *name, umode_t *mode, void **data,
+ const struct file_operations **fops)
+{
+ if (!strcmp(name, "enable")) {
+ *mode = TRACEFS_MODE_WRITE;
+ *fops = &remote_event_enable_fops;
+ return 1;
+ }
+
+ if (!strcmp(name, "id")) {
+ *mode = TRACEFS_MODE_READ;
+ *fops = &remote_event_id_fops;
+ return 1;
+ }
+
+ if (!strcmp(name, "format")) {
+ *mode = TRACEFS_MODE_READ;
+ *fops = &remote_event_format_fops;
+ return 1;
+ }
+
+ return 0;
+}
+
+/* events/enable: apply 0/1 to every event of the remote (best effort) */
+static ssize_t remote_events_dir_enable_write(struct file *filp, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct trace_remote *remote = file_inode(filp)->i_private;
+ int i, ret;
+ u8 enable;
+
+ ret = kstrtou8_from_user(ubuf, count, 10, &enable);
+ if (ret)
+ return ret;
+
+ guard(mutex)(&remote->lock);
+
+ for (i = 0; i < remote->nr_events; i++) {
+ struct remote_event *evt = &remote->events[i];
+
+ /* Per-event failures are deliberately ignored: enable what we can */
+ trace_remote_enable_event(remote, evt, enable);
+ }
+
+ return count;
+}
+
+/*
+ * events/enable: "0" if all events are off, "1" if all are on, "X" when
+ * mixed — matching the local tracing events/enable file convention.
+ */
+static ssize_t remote_events_dir_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct trace_remote *remote = file_inode(filp)->i_private;
+ const char enabled_char[] = {'0', '1', 'X'};
+ char enabled_str[] = " \n";
+ int i, enabled = -1;
+
+ guard(mutex)(&remote->lock);
+
+ for (i = 0; i < remote->nr_events; i++) {
+ struct remote_event *evt = &remote->events[i];
+
+ if (enabled == -1) {
+ enabled = evt->enabled;
+ } else if (enabled != evt->enabled) {
+ enabled = 2;
+ break;
+ }
+ }
+
+ /* enabled stays -1 only when there are no events: report '0' */
+ enabled_str[0] = enabled_char[enabled == -1 ? 0 : enabled];
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, enabled_str, 2);
+}
+
+static const struct file_operations remote_events_dir_enable_fops = {
+ .write = remote_events_dir_enable_write,
+ .read = remote_events_dir_enable_read,
+};
+
+/* events/header_page: describe the ring-buffer page header layout */
+static ssize_t
+remote_events_dir_header_page_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_seq *s;
+ int ret;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+
+ ring_buffer_print_page_header(NULL, s);
+ ret = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s));
+ kfree(s);
+
+ return ret;
+}
+
+static const struct file_operations remote_events_dir_header_page_fops = {
+ .read = remote_events_dir_header_page_read,
+};
+
+/* events/header_event: describe the per-entry header layout */
+static ssize_t
+remote_events_dir_header_event_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_seq *s;
+ int ret;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+
+ ring_buffer_print_entry_header(s);
+ ret = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s));
+ kfree(s);
+
+ return ret;
+}
+
+static const struct file_operations remote_events_dir_header_event_fops = {
+ .read = remote_events_dir_header_event_read,
+};
+
+/* eventfs callback: lazily provide fops for the events/ directory files */
+static int remote_events_dir_callback(const char *name, umode_t *mode, void **data,
+ const struct file_operations **fops)
+{
+ if (!strcmp(name, "enable")) {
+ *mode = TRACEFS_MODE_WRITE;
+ *fops = &remote_events_dir_enable_fops;
+ return 1;
+ }
+
+ if (!strcmp(name, "header_page")) {
+ *mode = TRACEFS_MODE_READ;
+ *fops = &remote_events_dir_header_page_fops;
+ return 1;
+ }
+
+ if (!strcmp(name, "header_event")) {
+ *mode = TRACEFS_MODE_READ;
+ *fops = &remote_events_dir_header_event_fops;
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Create eventfs entries for one event. The events/ directory and its
+ * single <remote_name> "system" sub-directory are created lazily on the
+ * first call, and removed again if that first event's directory fails.
+ */
+static int trace_remote_init_eventfs(const char *remote_name, struct trace_remote *remote,
+ struct remote_event *evt)
+{
+ struct eventfs_inode *eventfs = remote->eventfs;
+ static struct eventfs_entry dir_entries[] = {
+ {
+ .name = "enable",
+ .callback = remote_events_dir_callback,
+ }, {
+ .name = "header_page",
+ .callback = remote_events_dir_callback,
+ }, {
+ .name = "header_event",
+ .callback = remote_events_dir_callback,
+ }
+ };
+ static struct eventfs_entry entries[] = {
+ {
+ .name = "enable",
+ .callback = remote_event_callback,
+ }, {
+ .name = "id",
+ .callback = remote_event_callback,
+ }, {
+ .name = "format",
+ .callback = remote_event_callback,
+ }
+ };
+ bool eventfs_create = false;
+
+ if (!eventfs) {
+ eventfs = eventfs_create_events_dir("events", remote->dentry, dir_entries,
+ ARRAY_SIZE(dir_entries), remote);
+ if (IS_ERR(eventfs))
+ return PTR_ERR(eventfs);
+
+ /*
+ * Create similar hierarchy as local events even if a single system is supported at
+ * the moment
+ */
+ eventfs = eventfs_create_dir(remote_name, eventfs, NULL, 0, NULL);
+ if (IS_ERR(eventfs))
+ return PTR_ERR(eventfs);
+
+ remote->eventfs = eventfs;
+ eventfs_create = true;
+ }
+
+ eventfs = eventfs_create_dir(evt->name, eventfs, entries, ARRAY_SIZE(entries), evt);
+ if (IS_ERR(eventfs)) {
+ /* Undo the lazy events dir creation only if this call did it */
+ if (eventfs_create) {
+ eventfs_remove_events_dir(remote->eventfs);
+ remote->eventfs = NULL;
+ }
+ return PTR_ERR(eventfs);
+ }
+
+ return 0;
+}
+
+/*
+ * Bind @events to @remote. Events must not already belong to a remote and
+ * must be sorted by ascending id (trace_remote_find_event() bsearches).
+ *
+ * Validate the whole array before touching it so a failure leaves no
+ * half-attached state: a single validate-and-attach pass would leave
+ * events[0..i-1].remote set when entry i is rejected.
+ *
+ * Return: 0 on success, -EEXIST if an event is already attached,
+ * -EINVAL if the array is not sorted by id.
+ */
+static int trace_remote_attach_events(struct trace_remote *remote, struct remote_event *events,
+ size_t nr_events)
+{
+ int i;
+
+ for (i = 0; i < nr_events; i++) {
+ if (events[i].remote)
+ return -EEXIST;
+
+ /* We need events to be sorted for efficient lookup */
+ if (i && events[i].id <= events[i - 1].id)
+ return -EINVAL;
+ }
+
+ for (i = 0; i < nr_events; i++)
+ events[i].remote = remote;
+
+ remote->events = events;
+ remote->nr_events = nr_events;
+
+ return 0;
+}
+
+/*
+ * Attach @events to @remote and expose each one in eventfs. Eventfs
+ * failures only produce a warning: the event is still usable through the
+ * remote, just not visible/controllable from userspace.
+ */
+static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote,
+ struct remote_event *events, size_t nr_events)
+{
+ int i, ret;
+
+ ret = trace_remote_attach_events(remote, events, nr_events);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < nr_events; i++) {
+ struct remote_event *evt = &events[i];
+
+ ret = trace_remote_init_eventfs(remote_name, remote, evt);
+ if (ret)
+ pr_warn("Failed to init eventfs for event '%s' (%d)",
+ evt->name, ret);
+ }
+
+ return 0;
+}
+
+/*
+ * bsearch() comparator: the key is the target event id smuggled through
+ * the pointer argument, compared against the candidate's id. Only the
+ * sign of the result matters.
+ */
+static int __cmp_events(const void *key, const void *data)
+{
+ const struct remote_event *evt = data;
+ int target = (int)((long)key);
+ int cur = (int)evt->id;
+
+ if (target < cur)
+ return -1;
+ if (target > cur)
+ return 1;
+ return 0;
+}
+
+/* Look up an event by id in the sorted remote->events array */
+static struct remote_event *trace_remote_find_event(struct trace_remote *remote, unsigned short id)
+{
+ const void *key = (const void *)(unsigned long)id;
+
+ return bsearch(key, remote->events, remote->nr_events, sizeof(*remote->events),
+ __cmp_events);
+}
diff --git a/kernel/trace/trace_snapshot.c b/kernel/trace/trace_snapshot.c
new file mode 100644
index 000000000000..07b43c9863a2
--- /dev/null
+++ b/kernel/trace/trace_snapshot.c
@@ -0,0 +1,1066 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fsnotify.h>
+
+#include <asm/setup.h> /* COMMAND_LINE_SIZE */
+
+#include "trace.h"
+
+/* Used if snapshot allocated at boot */
+static bool allocate_snapshot;
+static bool snapshot_at_boot;
+
+static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata;
+static int boot_snapshot_index;
+
+/*
+ * Parse "alloc_snapshot[=instances]". With a value, the instance list is
+ * accumulated (tab-separated) into boot_snapshot_info for later use;
+ * without one, the top-level snapshot buffer is allocated at boot.
+ */
+static int __init boot_alloc_snapshot(char *str)
+{
+ char *slot = boot_snapshot_info + boot_snapshot_index;
+ int left = sizeof(boot_snapshot_info) - boot_snapshot_index;
+ int ret;
+
+ if (str[0] == '=') {
+ str++;
+ if (strlen(str) >= left)
+ return -1;
+
+ ret = snprintf(slot, left, "%s\t", str);
+ boot_snapshot_index += ret;
+ } else {
+ allocate_snapshot = true;
+ /* We also need the main ring buffer expanded */
+ trace_set_ring_buffer_expanded(NULL);
+ }
+ return 1;
+}
+__setup("alloc_snapshot", boot_alloc_snapshot);
+
+
+/* "ftrace_boot_snapshot": also take a snapshot once boot tracing is done */
+static int __init boot_snapshot(char *str)
+{
+ snapshot_at_boot = true;
+ boot_alloc_snapshot(str);
+ return 1;
+}
+__setup("ftrace_boot_snapshot", boot_snapshot);
+/*
+ * Core snapshot: swap the live buffer with the snapshot buffer (via
+ * update_max_tr()). Refuses to run from NMI, without an allocated
+ * snapshot, with a memory-mapped buffer, or while a latency tracer owns
+ * the snapshot — each refusal leaves a note in the trace itself.
+ */
+static void tracing_snapshot_instance_cond(struct trace_array *tr,
+ void *cond_data)
+{
+ unsigned long flags;
+
+ if (in_nmi()) {
+ trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
+ trace_array_puts(tr, "*** snapshot is being ignored ***\n");
+ return;
+ }
+
+ if (!tr->allocated_snapshot) {
+ trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n");
+ trace_array_puts(tr, "*** stopping trace here! ***\n");
+ tracer_tracing_off(tr);
+ return;
+ }
+
+ if (tr->mapped) {
+ trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n");
+ trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
+ return;
+ }
+
+ /* Note, snapshot can not be used when the tracer uses it */
+ if (tracer_uses_snapshot(tr->current_trace)) {
+ trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n");
+ trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
+ return;
+ }
+
+ local_irq_save(flags);
+ update_max_tr(tr, current, smp_processor_id(), cond_data);
+ local_irq_restore(flags);
+}
+
+/* Unconditional snapshot of @tr */
+void tracing_snapshot_instance(struct trace_array *tr)
+{
+ tracing_snapshot_instance_cond(tr, NULL);
+}
+
+/**
+ * tracing_snapshot_cond - conditionally take a snapshot of the current buffer.
+ * @tr: The tracing instance to snapshot
+ * @cond_data: The data to be tested conditionally, and possibly saved
+ *
+ * This is the same as tracing_snapshot() except that the snapshot is
+ * conditional - the snapshot will only happen if the
+ * cond_snapshot.update() implementation receiving the cond_data
+ * returns true, which means that the trace array's cond_snapshot
+ * update() operation used the cond_data to determine whether the
+ * snapshot should be taken, and if it was, presumably saved it along
+ * with the snapshot.
+ */
+void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
+{
+ tracing_snapshot_instance_cond(tr, cond_data);
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_cond);
+
+/**
+ * tracing_cond_snapshot_data - get the user data associated with a snapshot
+ * @tr: The tracing instance
+ *
+ * When the user enables a conditional snapshot using
+ * tracing_snapshot_cond_enable(), the user-defined cond_data is saved
+ * with the snapshot. This accessor is used to retrieve it.
+ *
+ * Should not be called from cond_snapshot.update(), since it takes
+ * the tr->max_lock lock, which the code calling
+ * cond_snapshot.update() has already done.
+ *
+ * Returns the cond_data associated with the trace array's snapshot.
+ */
+void *tracing_cond_snapshot_data(struct trace_array *tr)
+{
+ void *cond_data = NULL;
+
+ local_irq_disable();
+ arch_spin_lock(&tr->max_lock);
+
+ if (tr->cond_snapshot)
+ cond_data = tr->cond_snapshot->cond_data;
+
+ arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
+
+ return cond_data;
+}
+EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data);
+
+/*
+ * Resize @trace_buf so its per-CPU entry counts match @size_buf's, for one
+ * CPU or all of them (@cpu_id == RING_BUFFER_ALL_CPUS). On partial failure
+ * in the all-CPU case, CPUs already resized keep their new size.
+ */
+int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
+ struct array_buffer *size_buf, int cpu_id)
+{
+ int cpu, ret = 0;
+
+ if (cpu_id == RING_BUFFER_ALL_CPUS) {
+ for_each_tracing_cpu(cpu) {
+ ret = ring_buffer_resize(trace_buf->buffer,
+ per_cpu_ptr(size_buf->data, cpu)->entries, cpu);
+ if (ret < 0)
+ break;
+ per_cpu_ptr(trace_buf->data, cpu)->entries =
+ per_cpu_ptr(size_buf->data, cpu)->entries;
+ }
+ } else {
+ ret = ring_buffer_resize(trace_buf->buffer,
+ per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id);
+ if (ret == 0)
+ per_cpu_ptr(trace_buf->data, cpu_id)->entries =
+ per_cpu_ptr(size_buf->data, cpu_id)->entries;
+ }
+
+ return ret;
+}
+
+/*
+ * Make sure @tr's snapshot buffer exists and mirrors the main buffer's
+ * sub-buffer order and per-CPU sizes. Idempotent.
+ */
+int tracing_alloc_snapshot_instance(struct trace_array *tr)
+{
+ int order;
+ int ret;
+
+ if (!tr->allocated_snapshot) {
+
+ /* Make the snapshot buffer have the same order as main buffer */
+ order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+ ret = ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, order);
+ if (ret < 0)
+ return ret;
+
+ /* allocate spare buffer */
+ ret = resize_buffer_duplicate_size(&tr->snapshot_buffer,
+ &tr->array_buffer, RING_BUFFER_ALL_CPUS);
+ if (ret < 0)
+ return ret;
+
+ tr->allocated_snapshot = true;
+ }
+
+ return 0;
+}
+
+void free_snapshot(struct trace_array *tr)
+{
+ /*
+ * We don't free the ring buffer. Instead, we resize it because
+ * the max_tr ring buffer has some state (e.g. ring->clock) and
+ * we want to preserve it.
+ */
+ ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, 0);
+ ring_buffer_resize(tr->snapshot_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
+ trace_set_buffer_entries(&tr->snapshot_buffer, 1);
+ tracing_reset_online_cpus(&tr->snapshot_buffer);
+ tr->allocated_snapshot = false;
+}
+
+/*
+ * Take a refcount on snapshot usage (blocks buffer mmap) and allocate the
+ * snapshot buffer if needed. Caller holds trace_types_lock.
+ */
+int tracing_arm_snapshot_locked(struct trace_array *tr)
+{
+ int ret;
+
+ lockdep_assert_held(&trace_types_lock);
+
+ spin_lock(&tr->snapshot_trigger_lock);
+ /* Refuse when the counter would overflow or the buffer is mmapped */
+ if (tr->snapshot == UINT_MAX || tr->mapped) {
+ spin_unlock(&tr->snapshot_trigger_lock);
+ return -EBUSY;
+ }
+
+ tr->snapshot++;
+ spin_unlock(&tr->snapshot_trigger_lock);
+
+ ret = tracing_alloc_snapshot_instance(tr);
+ if (ret) {
+ /* Allocation failed: drop the refcount taken above */
+ spin_lock(&tr->snapshot_trigger_lock);
+ tr->snapshot--;
+ spin_unlock(&tr->snapshot_trigger_lock);
+ }
+
+ return ret;
+}
+
+int tracing_arm_snapshot(struct trace_array *tr)
+{
+ guard(mutex)(&trace_types_lock);
+ return tracing_arm_snapshot_locked(tr);
+}
+
+/* Drop the snapshot-usage refcount taken by tracing_arm_snapshot*() */
+void tracing_disarm_snapshot(struct trace_array *tr)
+{
+ spin_lock(&tr->snapshot_trigger_lock);
+ if (!WARN_ON(!tr->snapshot))
+ tr->snapshot--;
+ spin_unlock(&tr->snapshot_trigger_lock);
+}
+
+/**
+ * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer.
+ *
+ * This is similar to tracing_snapshot(), but it will allocate the
+ * snapshot buffer if it isn't already allocated. Use this only
+ * where it is safe to sleep, as the allocation may sleep.
+ *
+ * This causes a swap between the snapshot buffer and the current live
+ * tracing buffer. You can use this to take snapshots of the live
+ * trace when some condition is triggered, but continue to trace.
+ */
+void tracing_snapshot_alloc(void)
+{
+ int ret;
+
+ ret = tracing_alloc_snapshot();
+ if (ret < 0)
+ return;
+
+ tracing_snapshot();
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
+
+/**
+ * tracing_snapshot_cond_enable - enable conditional snapshot for an instance
+ * @tr: The tracing instance
+ * @cond_data: User data to associate with the snapshot
+ * @update: Implementation of the cond_snapshot update function
+ *
+ * Check whether the conditional snapshot for the given instance has
+ * already been enabled, or if the current tracer is already using a
+ * snapshot; if so, return -EBUSY, else create a cond_snapshot and
+ * save the cond_data and update function inside.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
+ cond_update_fn_t update)
+{
+ /* __free(kfree): auto-freed on early return, kept via no_free_ptr() */
+ struct cond_snapshot *cond_snapshot __free(kfree) =
+ kzalloc_obj(*cond_snapshot);
+ int ret;
+
+ if (!cond_snapshot)
+ return -ENOMEM;
+
+ cond_snapshot->cond_data = cond_data;
+ cond_snapshot->update = update;
+
+ guard(mutex)(&trace_types_lock);
+
+ if (tracer_uses_snapshot(tr->current_trace))
+ return -EBUSY;
+
+ /*
+ * The cond_snapshot can only change to NULL without the
+ * trace_types_lock. We don't care if we race with it going
+ * to NULL, but we want to make sure that it's not set to
+ * something other than NULL when we get here, which we can
+ * do safely with only holding the trace_types_lock and not
+ * having to take the max_lock.
+ */
+ if (tr->cond_snapshot)
+ return -EBUSY;
+
+ ret = tracing_arm_snapshot_locked(tr);
+ if (ret)
+ return ret;
+
+ local_irq_disable();
+ arch_spin_lock(&tr->max_lock);
+ tr->cond_snapshot = no_free_ptr(cond_snapshot);
+ arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable);
+
+/**
+ * tracing_snapshot_cond_disable - disable conditional snapshot for an instance
+ * @tr: The tracing instance
+ *
+ * Check whether the conditional snapshot for the given instance is
+ * enabled; if so, free the cond_snapshot associated with it,
+ * otherwise return -EINVAL.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int tracing_snapshot_cond_disable(struct trace_array *tr)
+{
+ int ret = 0;
+
+ local_irq_disable();
+ arch_spin_lock(&tr->max_lock);
+
+ if (!tr->cond_snapshot)
+ ret = -EINVAL;
+ else {
+ kfree(tr->cond_snapshot);
+ tr->cond_snapshot = NULL;
+ }
+
+ arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
+
+ /* NOTE(review): disarms even on -EINVAL; disarm WARNs if unbalanced */
+ tracing_disarm_snapshot(tr);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef LATENCY_FS_NOTIFY
+static struct workqueue_struct *fsnotify_wq;
+
+/* Workqueue side: safe context to emit the fsnotify event */
+static void latency_fsnotify_workfn(struct work_struct *work)
+{
+ struct trace_array *tr = container_of(work, struct trace_array,
+ fsnotify_work);
+ fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY);
+}
+
+/* irq_work side: bounce to the workqueue from any context */
+static void latency_fsnotify_workfn_irq(struct irq_work *iwork)
+{
+ struct trace_array *tr = container_of(iwork, struct trace_array,
+ fsnotify_irqwork);
+ queue_work(fsnotify_wq, &tr->fsnotify_work);
+}
+
+__init static int latency_fsnotify_init(void)
+{
+ fsnotify_wq = alloc_workqueue("tr_max_lat_wq",
+ WQ_UNBOUND | WQ_HIGHPRI, 0);
+ if (!fsnotify_wq) {
+ pr_err("Unable to allocate tr_max_lat_wq\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+late_initcall_sync(latency_fsnotify_init);
+
+/* Notify watchers of tracing_max_latency that a new max was recorded */
+void latency_fsnotify(struct trace_array *tr)
+{
+ if (!fsnotify_wq)
+ return;
+ /*
+ * We cannot call queue_work(&tr->fsnotify_work) from here because it's
+ * possible that we are called from __schedule() or do_idle(), which
+ * could cause a deadlock.
+ */
+ irq_work_queue(&tr->fsnotify_irqwork);
+}
+#endif /* LATENCY_FS_NOTIFY */
+
+static const struct file_operations tracing_max_lat_fops;
+
+/* Create the tracing_max_latency file and its fsnotify plumbing */
+void trace_create_maxlat_file(struct trace_array *tr,
+ struct dentry *d_tracer)
+{
+#ifdef LATENCY_FS_NOTIFY
+ INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn);
+ init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq);
+#endif
+ tr->d_max_latency = trace_create_file("tracing_max_latency",
+ TRACE_MODE_WRITE,
+ d_tracer, tr,
+ &tracing_max_lat_fops);
+}
+
+/*
+ * Copy the new maximum trace into the separate maximum-trace
+ * structure. (this way the maximum trace is permanently saved,
+ * for later retrieval via /sys/kernel/tracing/tracing_max_latency)
+ *
+ * Caller holds tr->max_lock; @cpu is the CPU that triggered the latency.
+ */
+static void
+__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+ struct array_buffer *trace_buf = &tr->array_buffer;
+ struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu);
+ struct array_buffer *max_buf = &tr->snapshot_buffer;
+ struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu);
+
+ max_buf->cpu = cpu;
+ max_buf->time_start = data->preempt_timestamp;
+
+ max_data->saved_latency = tr->max_latency;
+ max_data->critical_start = data->critical_start;
+ max_data->critical_end = data->critical_end;
+
+ strscpy(max_data->comm, tsk->comm);
+ max_data->pid = tsk->pid;
+ /*
+ * If tsk == current, then use current_uid(), as that does not use
+ * RCU. The irq tracer can be called out of RCU scope.
+ */
+ if (tsk == current)
+ max_data->uid = current_uid();
+ else
+ max_data->uid = task_uid(tsk);
+
+ max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
+ max_data->policy = tsk->policy;
+ max_data->rt_priority = tsk->rt_priority;
+
+ /* record this tasks comm */
+ tracing_record_cmdline(tsk);
+ latency_fsnotify(tr);
+}
+#else
+static inline void __update_max_tr(struct trace_array *tr,
+ struct task_struct *tsk, int cpu) { }
+#endif /* CONFIG_TRACER_MAX_TRACE */
+
+/**
+ * update_max_tr - snapshot all trace buffers from global_trace to max_tr
+ * @tr: tracer
+ * @tsk: the task with the latency
+ * @cpu: The cpu that initiated the trace.
+ * @cond_data: User data associated with a conditional snapshot
+ *
+ * Flip the buffers between the @tr and the max_tr and record information
+ * about which task was the cause of this latency.
+ *
+ * Must be called with interrupts disabled.
+ */
+void
+update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
+ void *cond_data)
+{
+ if (tr->stop_count)
+ return;
+
+ WARN_ON_ONCE(!irqs_disabled());
+
+ if (!tr->allocated_snapshot) {
+ /* Only the nop tracer should hit this when disabling */
+ WARN_ON_ONCE(tr->current_trace != &nop_trace);
+ return;
+ }
+
+ arch_spin_lock(&tr->max_lock);
+
+ /* Inherit the recordable setting from array_buffer */
+ if (ring_buffer_record_is_set_on(tr->array_buffer.buffer))
+ ring_buffer_record_on(tr->snapshot_buffer.buffer);
+ else
+ ring_buffer_record_off(tr->snapshot_buffer.buffer);
+
+ /* A conditional snapshot's update() callback can veto the swap */
+ if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) {
+ arch_spin_unlock(&tr->max_lock);
+ return;
+ }
+
+ swap(tr->array_buffer.buffer, tr->snapshot_buffer.buffer);
+
+ __update_max_tr(tr, tsk, cpu);
+
+ arch_spin_unlock(&tr->max_lock);
+
+ /* Any waiters on the old snapshot buffer need to wake up */
+ ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS);
+}
+
+/**
+ * update_max_tr_single - only copy one trace over, and reset the rest
+ * @tr: tracer
+ * @tsk: task with the latency
+ * @cpu: the cpu of the buffer to copy.
+ *
+ * Flip the trace of a single CPU buffer between the @tr and the max_tr.
+ */
+void
+update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+ int ret;
+
+ if (tr->stop_count)
+ return;
+
+ WARN_ON_ONCE(!irqs_disabled());
+ if (!tr->allocated_snapshot) {
+ /* Only the nop tracer should hit this when disabling */
+ WARN_ON_ONCE(tr->current_trace != &nop_trace);
+ return;
+ }
+
+ arch_spin_lock(&tr->max_lock);
+
+ ret = ring_buffer_swap_cpu(tr->snapshot_buffer.buffer, tr->array_buffer.buffer, cpu);
+
+ if (ret == -EBUSY) {
+ /*
+ * We failed to swap the buffer due to a commit taking
+ * place on this CPU. We fail to record, but we reset
+ * the max trace buffer (no one writes directly to it)
+ * and flag that it failed.
+ * Another reason is resize is in progress.
+ */
+ trace_array_printk_buf(tr->snapshot_buffer.buffer, _THIS_IP_,
+ "Failed to swap buffers due to commit or resize in progress\n");
+ }
+
+ WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
+
+ __update_max_tr(tr, tsk, cpu);
+ arch_spin_unlock(&tr->max_lock);
+}
+
+/* Help text printed in the "snapshot" file header (top-level file) */
+static void show_snapshot_main_help(struct seq_file *m)
+{
+ seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"
+ "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
+ "# Takes a snapshot of the main buffer.\n"
+ "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"
+ "# (Doesn't have to be '2' works with any number that\n"
+ "# is not a '0' or '1')\n");
+}
+
+/* Help text for per_cpu/cpuN/snapshot; wording depends on swap support */
+static void show_snapshot_percpu_help(struct seq_file *m)
+{
+ seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
+#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
+ seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
+ "# Takes a snapshot of the main buffer for this cpu.\n");
+#else
+ seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n"
+ "# Must use main snapshot file to allocate.\n");
+#endif
+ seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"
+ "# (Doesn't have to be '2' works with any number that\n"
+ "# is not a '0' or '1')\n");
+}
+
+/* Emit allocation state + the appropriate command help for @iter's scope */
+void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
+{
+ if (iter->tr->allocated_snapshot)
+ seq_puts(m, "#\n# * Snapshot is allocated *\n#\n");
+ else
+ seq_puts(m, "#\n# * Snapshot is freed *\n#\n");
+
+ seq_puts(m, "# Snapshot commands:\n");
+ if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
+ show_snapshot_main_help(m);
+ else
+ show_snapshot_percpu_help(m);
+}
+
+/*
+ * Open the "snapshot" file. Readers get a full trace iterator over the
+ * snapshot buffer; write-only opens get a minimal seq_file + iterator
+ * pair just to carry the trace_array/cpu to tracing_snapshot_write().
+ */
+static int tracing_snapshot_open(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+ struct trace_iterator *iter;
+ struct seq_file *m;
+ int ret;
+
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
+
+ if (file->f_mode & FMODE_READ) {
+ iter = __tracing_open(inode, file, true);
+ if (IS_ERR(iter))
+ ret = PTR_ERR(iter);
+ } else {
+ /* Writes still need the seq_file to hold the private data */
+ ret = -ENOMEM;
+ m = kzalloc_obj(*m);
+ if (!m)
+ goto out;
+ iter = kzalloc_obj(*iter);
+ if (!iter) {
+ kfree(m);
+ goto out;
+ }
+ ret = 0;
+
+ iter->tr = tr;
+ iter->array_buffer = &tr->snapshot_buffer;
+ iter->cpu_file = tracing_get_cpu(inode);
+ m->private = iter;
+ file->private_data = m;
+ }
+out:
+ /* Drop the trace_array reference taken above on any failure */
+ if (ret < 0)
+ trace_array_put(tr);
+
+ return ret;
+}
+
+/* smp_call_function_single() target: swap this CPU's buffer in place */
+static void tracing_swap_cpu_buffer(void *tr)
+{
+ update_max_tr_single((struct trace_array *)tr, current, smp_processor_id());
+}
+
+/*
+ * Handle writes to the "snapshot" file:
+ * 0 - free the snapshot buffer (top-level file only)
+ * 1 - allocate (if needed) and take a snapshot (swap buffers)
+ * * - clear the snapshot buffer without freeing it
+ * Refused while a latency tracer or a conditional snapshot owns the
+ * snapshot buffer.
+ */
+static ssize_t
+tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct seq_file *m = filp->private_data;
+ struct trace_iterator *iter = m->private;
+ struct trace_array *tr = iter->tr;
+ unsigned long val;
+ int ret;
+
+ ret = tracing_update_buffers(tr);
+ if (ret < 0)
+ return ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ guard(mutex)(&trace_types_lock);
+
+ if (tracer_uses_snapshot(tr->current_trace))
+ return -EBUSY;
+
+ local_irq_disable();
+ arch_spin_lock(&tr->max_lock);
+ if (tr->cond_snapshot)
+ ret = -EBUSY;
+ arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
+ if (ret)
+ return ret;
+
+ switch (val) {
+ case 0:
+ if (iter->cpu_file != RING_BUFFER_ALL_CPUS)
+ return -EINVAL;
+ if (tr->allocated_snapshot)
+ free_snapshot(tr);
+ break;
+ case 1:
+/* Only allow per-cpu swap if the ring buffer supports it */
+#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP
+ if (iter->cpu_file != RING_BUFFER_ALL_CPUS)
+ return -EINVAL;
+#endif
+ /* Resize result is best-effort; arming below (re)allocates anyway */
+ if (tr->allocated_snapshot)
+ ret = resize_buffer_duplicate_size(&tr->snapshot_buffer,
+ &tr->array_buffer, iter->cpu_file);
+
+ ret = tracing_arm_snapshot_locked(tr);
+ if (ret)
+ return ret;
+
+ /* Now, we're going to swap */
+ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
+ local_irq_disable();
+ update_max_tr(tr, current, smp_processor_id(), NULL);
+ local_irq_enable();
+ } else {
+ /* Swap must run on the target CPU itself */
+ smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer,
+ (void *)tr, 1);
+ }
+ tracing_disarm_snapshot(tr);
+ break;
+ default:
+ if (tr->allocated_snapshot) {
+ if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
+ tracing_reset_online_cpus(&tr->snapshot_buffer);
+ else
+ tracing_reset_cpu(&tr->snapshot_buffer, iter->cpu_file);
+ }
+ break;
+ }
+
+ if (ret >= 0) {
+ *ppos += cnt;
+ ret = cnt;
+ }
+
+ return ret;
+}
+
+/*
+ * Release handler for the snapshot file. Read opens are torn down by
+ * tracing_release(); write-only opens allocated only a stub seq_file
+ * and iterator in tracing_snapshot_open(), which are freed here.
+ */
+static int tracing_snapshot_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m = file->private_data;
+ int ret;
+
+ ret = tracing_release(inode, file);
+
+ if (file->f_mode & FMODE_READ)
+ return ret;
+
+ /* If write only, the seq_file is just a stub */
+ if (m)
+ kfree(m->private);
+ kfree(m);
+
+ return 0;
+}
+
+/*
+ * Open handler for the raw (binary) snapshot file. Reuses the buffer
+ * file open path, then redirects the iterator at the snapshot buffer.
+ * Refused while the current tracer itself uses the snapshot buffer.
+ */
+static int snapshot_raw_open(struct inode *inode, struct file *filp)
+{
+ struct ftrace_buffer_info *info;
+ int ret;
+
+ /* The following checks for tracefs lockdown */
+ ret = tracing_buffers_open(inode, filp);
+ if (ret < 0)
+ return ret;
+
+ info = filp->private_data;
+
+ if (tracer_uses_snapshot(info->iter.trace)) {
+ /* Undo tracing_buffers_open() before bailing out */
+ tracing_buffers_release(inode, filp);
+ return -EBUSY;
+ }
+
+ info->iter.snapshot = true;
+ info->iter.array_buffer = &info->iter.tr->snapshot_buffer;
+
+ return ret;
+}
+
+/* Text "snapshot" file: seq_file reads, command writes (0/1/other) */
+const struct file_operations snapshot_fops = {
+ .open = tracing_snapshot_open,
+ .read = seq_read,
+ .write = tracing_snapshot_write,
+ .llseek = tracing_lseek,
+ .release = tracing_snapshot_release,
+};
+
+/* Binary "snapshot_raw" file: read-only page-level access to the snapshot */
+const struct file_operations snapshot_raw_fops = {
+ .open = snapshot_raw_open,
+ .read = tracing_buffers_read,
+ .release = tracing_buffers_release,
+ .splice_read = tracing_buffers_splice_read,
+};
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+/* Read the recorded max latency (nanoseconds) for this trace array */
+static ssize_t
+tracing_max_lat_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+
+ return tracing_nsecs_read(&tr->max_latency, ubuf, cnt, ppos);
+}
+
+/* Overwrite the recorded max latency (nanoseconds) for this trace array */
+static ssize_t
+tracing_max_lat_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+
+ return tracing_nsecs_write(&tr->max_latency, ubuf, cnt, ppos);
+}
+
+static const struct file_operations tracing_max_lat_fops = {
+ .open = tracing_open_generic_tr,
+ .read = tracing_max_lat_read,
+ .write = tracing_max_lat_write,
+ .llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
+};
+#endif /* CONFIG_TRACER_MAX_TRACE */
+
+/*
+ * Take a "mapped" reference on @tr so snapshot swaps are blocked while
+ * the ring buffer is memory mapped. Fails with -EBUSY if a snapshot is
+ * in flight or the reference count would overflow.
+ */
+int get_snapshot_map(struct trace_array *tr)
+{
+ int err = 0;
+
+ /*
+ * Called with mmap_lock held. lockdep would be unhappy if we would now
+ * take trace_types_lock. Instead use the specific
+ * snapshot_trigger_lock.
+ */
+ spin_lock(&tr->snapshot_trigger_lock);
+
+ if (tr->snapshot || tr->mapped == UINT_MAX)
+ err = -EBUSY;
+ else
+ tr->mapped++;
+
+ spin_unlock(&tr->snapshot_trigger_lock);
+
+ /* Wait for update_max_tr() to observe iter->tr->mapped */
+ if (tr->mapped == 1)
+ synchronize_rcu();
+
+ return err;
+
+}
+
+/* Drop a get_snapshot_map() reference; warns on underflow */
+void put_snapshot_map(struct trace_array *tr)
+{
+ spin_lock(&tr->snapshot_trigger_lock);
+ if (!WARN_ON(!tr->mapped))
+ tr->mapped--;
+ spin_unlock(&tr->snapshot_trigger_lock);
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+/* ftrace probe callback: take a snapshot every time the probed IP hits */
+static void
+ftrace_snapshot(unsigned long ip, unsigned long parent_ip,
+ struct trace_array *tr, struct ftrace_probe_ops *ops,
+ void *data)
+{
+ tracing_snapshot_instance(tr);
+}
+
+/*
+ * ftrace probe callback with a per-IP countdown: take a snapshot until
+ * the count stored in the func mapper reaches zero, then stop. A probe
+ * registered without a count (no mapper entry) snapshots unconditionally.
+ */
+static void
+ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip,
+ struct trace_array *tr, struct ftrace_probe_ops *ops,
+ void *data)
+{
+ struct ftrace_func_mapper *mapper = data;
+ long *count = NULL;
+
+ if (mapper)
+ count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
+
+ if (count) {
+
+ if (*count <= 0)
+ return;
+
+ (*count)--;
+ }
+
+ tracing_snapshot_instance(tr);
+}
+
+/*
+ * Show one registered snapshot probe as "func:snapshot:count=N" or
+ * "func:snapshot:unlimited" in set_ftrace_filter output.
+ */
+static int
+ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *data)
+{
+ struct ftrace_func_mapper *mapper = data;
+ long *count = NULL;
+
+ seq_printf(m, "%ps:", (void *)ip);
+
+ seq_puts(m, "snapshot");
+
+ if (mapper)
+ count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
+
+ if (count)
+ seq_printf(m, ":count=%ld\n", *count);
+ else
+ seq_puts(m, ":unlimited\n");
+
+ return 0;
+}
+
+/*
+ * Probe init: lazily allocate the shared func mapper and record the
+ * initial countdown (@init_data) for @ip.
+ */
+static int
+ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
+ unsigned long ip, void *init_data, void **data)
+{
+ struct ftrace_func_mapper *mapper = *data;
+
+ if (!mapper) {
+ mapper = allocate_ftrace_func_mapper();
+ if (!mapper)
+ return -ENOMEM;
+ *data = mapper;
+ }
+
+ return ftrace_func_mapper_add_ip(mapper, ip, init_data);
+}
+
+/*
+ * Probe free: ip == 0 means the whole probe is going away, so release
+ * the mapper; otherwise remove just this IP's countdown entry.
+ */
+static void
+ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
+ unsigned long ip, void *data)
+{
+ struct ftrace_func_mapper *mapper = data;
+
+ if (!ip) {
+ if (!mapper)
+ return;
+ free_ftrace_func_mapper(mapper, NULL);
+ return;
+ }
+
+ ftrace_func_mapper_remove_ip(mapper, ip);
+}
+
+/* "snapshot" probe without a count: no per-IP state to init/free */
+static struct ftrace_probe_ops snapshot_probe_ops = {
+ .func = ftrace_snapshot,
+ .print = ftrace_snapshot_print,
+};
+
+/* "snapshot:N" probe: per-IP countdown kept in a func mapper */
+static struct ftrace_probe_ops snapshot_count_probe_ops = {
+ .func = ftrace_count_snapshot,
+ .print = ftrace_snapshot_print,
+ .init = ftrace_snapshot_init,
+ .free = ftrace_snapshot_free,
+};
+
+/*
+ * set_ftrace_filter command handler for "func:snapshot[:count]".
+ * A leading '!' unregisters the probe and disarms the snapshot; the
+ * optional ":count" selects the counting probe ops. The snapshot is
+ * armed before registration and disarmed again if registration fails.
+ */
+static int
+ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
+ char *glob, char *cmd, char *param, int enable)
+{
+ struct ftrace_probe_ops *ops;
+ void *count = (void *)-1;
+ char *number;
+ int ret;
+
+ if (!tr)
+ return -ENODEV;
+
+ /* hash funcs only work with set_ftrace_filter */
+ if (!enable)
+ return -EINVAL;
+
+ ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops;
+
+ if (glob[0] == '!') {
+ ret = unregister_ftrace_function_probe_func(glob+1, tr, ops);
+ if (!ret)
+ tracing_disarm_snapshot(tr);
+
+ return ret;
+ }
+
+ if (!param)
+ goto out_reg;
+
+ number = strsep(&param, ":");
+
+ if (!strlen(number))
+ goto out_reg;
+
+ /*
+ * We use the callback data field (which is a pointer)
+ * as our counter.
+ */
+ ret = kstrtoul(number, 0, (unsigned long *)&count);
+ if (ret)
+ return ret;
+
+ out_reg:
+ ret = tracing_arm_snapshot(tr);
+ if (ret < 0)
+ return ret;
+
+ ret = register_ftrace_function_probe(glob, tr, ops, count);
+ if (ret < 0)
+ tracing_disarm_snapshot(tr);
+
+ return ret < 0 ? ret : 0;
+}
+
+/* Hooks "snapshot" up as a set_ftrace_filter command */
+static struct ftrace_func_command ftrace_snapshot_cmd = {
+ .name = "snapshot",
+ .func = ftrace_trace_snapshot_callback,
+};
+
+__init int register_snapshot_cmd(void)
+{
+ return register_ftrace_command(&ftrace_snapshot_cmd);
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+/*
+ * Allocate the snapshot buffer for @tr. Unless a boot-time snapshot
+ * was requested (allocate_snapshot set), only a minimal 1-entry buffer
+ * is created as a placeholder to be resized on first use.
+ */
+int trace_allocate_snapshot(struct trace_array *tr, int size)
+{
+ int ret;
+
+ /* Trace arrays mapped to a fixed memory range have no snapshot buffers */
+ if (tr->range_addr_start)
+ return 0;
+
+ /* allocate_snapshot can only be true during system boot */
+ ret = allocate_trace_buffer(tr, &tr->snapshot_buffer,
+ allocate_snapshot ? size : 1);
+ if (ret < 0)
+ return -ENOMEM;
+
+ tr->allocated_snapshot = allocate_snapshot;
+
+ /* One-shot: consumed by the first trace array allocated after boot */
+ allocate_snapshot = false;
+ return 0;
+}
+
+/*
+ * tr_needs_alloc_snapshot - was a boot-time snapshot requested for @name?
+ * @name: trace instance name to look up
+ *
+ * boot_snapshot_info is a '\t'-separated list of instance names given
+ * on the kernel command line. Return true iff @name appears in it.
+ */
+__init static bool tr_needs_alloc_snapshot(const char *name)
+{
+ char *test;
+ int len = strlen(name);
+ bool ret;
+
+ if (!boot_snapshot_index)
+ return false;
+
+ /* The first entry has no leading '\t'; match it at the list head */
+ if (strncmp(name, boot_snapshot_info, len) == 0 &&
+ boot_snapshot_info[len] == '\t')
+ return true;
+
+ /* "\t" + name + "\t" + NUL */
+ test = kmalloc(strlen(name) + 3, GFP_KERNEL);
+ if (!test)
+ return false;
+
+ sprintf(test, "\t%s\t", name);
+ /*
+ * A snapshot is needed when @name IS in the list. The previous
+ * "== NULL" comparison inverted this, returning true for names
+ * that were never requested (and false for ones that were),
+ * contradicting the head-of-list match above.
+ */
+ ret = strstr(boot_snapshot_info, test) != NULL;
+ kfree(test);
+ return ret;
+}
+
+/*
+ * Arm boot-time snapshot allocation for instance @name if it was
+ * requested on the command line.
+ */
+__init void do_allocate_snapshot(const char *name)
+{
+ if (!tr_needs_alloc_snapshot(name))
+ return;
+
+ /*
+ * When allocate_snapshot is set, the next call to
+ * allocate_trace_buffers() (called by trace_array_get_by_name())
+ * will allocate the snapshot buffer. That will also clear
+ * this flag.
+ */
+ allocate_snapshot = true;
+}
+
+/*
+ * At the end of boot, take a snapshot of every trace array that had a
+ * snapshot buffer allocated, when "ftrace_boot_snapshot" was requested.
+ */
+void __init ftrace_boot_snapshot(void)
+{
+ struct trace_array *tr;
+
+ if (!snapshot_at_boot)
+ return;
+
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (!tr->allocated_snapshot)
+ continue;
+
+ tracing_snapshot_instance(tr);
+ trace_array_puts(tr, "** Boot snapshot taken **\n");
+ }
+}
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 37317b81fcda..8ad72e17d8eb 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -174,7 +174,6 @@ sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadat
{ O_NOFOLLOW, "O_NOFOLLOW" },
{ O_NOATIME, "O_NOATIME" },
{ O_CLOEXEC, "O_CLOEXEC" },
- { -1, NULL }
};
trace_seq_printf(s, "%s(", entry->name);
@@ -205,7 +204,7 @@ sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadat
trace_seq_puts(s, "O_RDONLY|");
}
- trace_print_flags_seq(s, "|", bits, __flags);
+ trace_print_flags_seq(s, "|", bits, __flags, ARRAY_SIZE(__flags));
/*
* trace_print_flags_seq() adds a '\0' to the
* buffer, but this needs to append more to the seq.
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 91905aa19294..dffef52a807b 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -300,6 +300,8 @@ static int tracepoint_add_func(struct tracepoint *tp,
lockdep_is_held(&tracepoints_mutex));
old = func_add(&tp_funcs, func, prio);
if (IS_ERR(old)) {
+ if (tp->ext && tp->ext->unregfunc && !static_key_enabled(&tp->key))
+ tp->ext->unregfunc();
WARN_ON_ONCE(warn && PTR_ERR(old) != -ENOMEM);
return PTR_ERR(old);
}
diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
index 8d82913223a1..8614430ca212 100644
--- a/kernel/vmcore_info.c
+++ b/kernel/vmcore_info.c
@@ -18,8 +18,6 @@
#include <asm/page.h>
#include <asm/sections.h>
-#include <crypto/sha1.h>
-
#include "kallsyms_internal.h"
#include "kexec_internal.h"
@@ -198,7 +196,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(page, lru);
VMCOREINFO_OFFSET(page, _mapcount);
VMCOREINFO_OFFSET(page, private);
- VMCOREINFO_OFFSET(page, compound_head);
+ VMCOREINFO_OFFSET(page, compound_info);
VMCOREINFO_OFFSET(pglist_data, node_zones);
VMCOREINFO_OFFSET(pglist_data, nr_zones);
#ifdef CONFIG_FLATMEM
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7d675781bc91..87dd5e0f6968 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -61,6 +61,13 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
# endif /* CONFIG_SMP */
/*
+ * Number of consecutive missed interrupts before declaring a lockup.
+ * Default to 1 (immediate) for NMI/Perf. Buddy will overwrite this to 3.
+ */
+int __read_mostly watchdog_hardlockup_miss_thresh = 1;
+EXPORT_SYMBOL_GPL(watchdog_hardlockup_miss_thresh);
+
+/*
* Should we panic when a soft-lockup or hard-lockup occurs:
*/
unsigned int __read_mostly hardlockup_panic =
@@ -137,6 +144,7 @@ __setup("nmi_watchdog=", hardlockup_panic_setup);
static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts);
static DEFINE_PER_CPU(int, hrtimer_interrupts_saved);
+static DEFINE_PER_CPU(int, hrtimer_interrupts_missed);
static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned);
static DEFINE_PER_CPU(bool, watchdog_hardlockup_touched);
static unsigned long hard_lockup_nmi_warn;
@@ -159,21 +167,33 @@ void watchdog_hardlockup_touch_cpu(unsigned int cpu)
per_cpu(watchdog_hardlockup_touched, cpu) = true;
}
-static bool is_hardlockup(unsigned int cpu)
+static void watchdog_hardlockup_update_reset(unsigned int cpu)
{
int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu));
- if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
- return true;
-
/*
* NOTE: we don't need any fancy atomic_t or READ_ONCE/WRITE_ONCE
* for hrtimer_interrupts_saved. hrtimer_interrupts_saved is
* written/read by a single CPU.
*/
per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
+ per_cpu(hrtimer_interrupts_missed, cpu) = 0;
+}
+
+static bool is_hardlockup(unsigned int cpu)
+{
+ int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu));
- return false;
+ if (per_cpu(hrtimer_interrupts_saved, cpu) != hrint) {
+ watchdog_hardlockup_update_reset(cpu);
+ return false;
+ }
+
+ per_cpu(hrtimer_interrupts_missed, cpu)++;
+ if (per_cpu(hrtimer_interrupts_missed, cpu) % watchdog_hardlockup_miss_thresh)
+ return false;
+
+ return true;
}
static void watchdog_hardlockup_kick(void)
@@ -187,8 +207,11 @@ static void watchdog_hardlockup_kick(void)
void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
{
int hardlockup_all_cpu_backtrace;
+ unsigned int this_cpu;
+ unsigned long flags;
if (per_cpu(watchdog_hardlockup_touched, cpu)) {
+ watchdog_hardlockup_update_reset(cpu);
per_cpu(watchdog_hardlockup_touched, cpu) = false;
return;
}
@@ -201,74 +224,73 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
* fired multiple times before we overflow'd. If it hasn't
* then this is a good indication the cpu is stuck
*/
- if (is_hardlockup(cpu)) {
- unsigned int this_cpu = smp_processor_id();
- unsigned long flags;
+ if (!is_hardlockup(cpu)) {
+ per_cpu(watchdog_hardlockup_warned, cpu) = false;
+ return;
+ }
#ifdef CONFIG_SYSFS
- ++hardlockup_count;
+ ++hardlockup_count;
#endif
- /*
- * A poorly behaving BPF scheduler can trigger hard lockup by
- * e.g. putting numerous affinitized tasks in a single queue and
- * directing all CPUs at it. The following call can return true
- * only once when sched_ext is enabled and will immediately
- * abort the BPF scheduler and print out a warning message.
- */
- if (scx_hardlockup(cpu))
- return;
+ /*
+ * A poorly behaving BPF scheduler can trigger hard lockup by
+ * e.g. putting numerous affinitized tasks in a single queue and
+ * directing all CPUs at it. The following call can return true
+ * only once when sched_ext is enabled and will immediately
+ * abort the BPF scheduler and print out a warning message.
+ */
+ if (scx_hardlockup(cpu))
+ return;
- /* Only print hardlockups once. */
- if (per_cpu(watchdog_hardlockup_warned, cpu))
- return;
+ /* Only print hardlockups once. */
+ if (per_cpu(watchdog_hardlockup_warned, cpu))
+ return;
- /*
- * Prevent multiple hard-lockup reports if one cpu is already
- * engaged in dumping all cpu back traces.
- */
- if (hardlockup_all_cpu_backtrace) {
- if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn))
- return;
- }
+ /*
+ * Prevent multiple hard-lockup reports if one cpu is already
+ * engaged in dumping all cpu back traces.
+ */
+ if (hardlockup_all_cpu_backtrace) {
+ if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn))
+ return;
+ }
- /*
- * NOTE: we call printk_cpu_sync_get_irqsave() after printing
- * the lockup message. While it would be nice to serialize
- * that printout, we really want to make sure that if some
- * other CPU somehow locked up while holding the lock associated
- * with printk_cpu_sync_get_irqsave() that we can still at least
- * get the message about the lockup out.
- */
- pr_emerg("CPU%u: Watchdog detected hard LOCKUP on cpu %u\n", this_cpu, cpu);
- printk_cpu_sync_get_irqsave(flags);
+ /*
+ * NOTE: we call printk_cpu_sync_get_irqsave() after printing
+ * the lockup message. While it would be nice to serialize
+ * that printout, we really want to make sure that if some
+ * other CPU somehow locked up while holding the lock associated
+ * with printk_cpu_sync_get_irqsave() that we can still at least
+ * get the message about the lockup out.
+ */
+ this_cpu = smp_processor_id();
+ pr_emerg("CPU%u: Watchdog detected hard LOCKUP on cpu %u\n", this_cpu, cpu);
+ printk_cpu_sync_get_irqsave(flags);
- print_modules();
- print_irqtrace_events(current);
- if (cpu == this_cpu) {
- if (regs)
- show_regs(regs);
- else
- dump_stack();
- printk_cpu_sync_put_irqrestore(flags);
- } else {
- printk_cpu_sync_put_irqrestore(flags);
- trigger_single_cpu_backtrace(cpu);
- }
+ print_modules();
+ print_irqtrace_events(current);
+ if (cpu == this_cpu) {
+ if (regs)
+ show_regs(regs);
+ else
+ dump_stack();
+ printk_cpu_sync_put_irqrestore(flags);
+ } else {
+ printk_cpu_sync_put_irqrestore(flags);
+ trigger_single_cpu_backtrace(cpu);
+ }
- if (hardlockup_all_cpu_backtrace) {
- trigger_allbutcpu_cpu_backtrace(cpu);
- if (!hardlockup_panic)
- clear_bit_unlock(0, &hard_lockup_nmi_warn);
- }
+ if (hardlockup_all_cpu_backtrace) {
+ trigger_allbutcpu_cpu_backtrace(cpu);
+ if (!hardlockup_panic)
+ clear_bit_unlock(0, &hard_lockup_nmi_warn);
+ }
- sys_info(hardlockup_si_mask & ~SYS_INFO_ALL_BT);
- if (hardlockup_panic)
- nmi_panic(regs, "Hard LOCKUP");
+ sys_info(hardlockup_si_mask & ~SYS_INFO_ALL_BT);
+ if (hardlockup_panic)
+ nmi_panic(regs, "Hard LOCKUP");
- per_cpu(watchdog_hardlockup_warned, cpu) = true;
- } else {
- per_cpu(watchdog_hardlockup_warned, cpu) = false;
- }
+ per_cpu(watchdog_hardlockup_warned, cpu) = true;
}
#else /* CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */
diff --git a/kernel/watchdog_buddy.c b/kernel/watchdog_buddy.c
index ee754d767c21..3a1e57080c1c 100644
--- a/kernel/watchdog_buddy.c
+++ b/kernel/watchdog_buddy.c
@@ -21,6 +21,7 @@ static unsigned int watchdog_next_cpu(unsigned int cpu)
int __init watchdog_hardlockup_probe(void)
{
+ watchdog_hardlockup_miss_thresh = 3;
return 0;
}
@@ -86,14 +87,6 @@ void watchdog_buddy_check_hardlockup(int hrtimer_interrupts)
{
unsigned int next_cpu;
- /*
- * Test for hardlockups every 3 samples. The sample period is
- * watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
- * watchdog_thresh (over by 20%).
- */
- if (hrtimer_interrupts % 3 != 0)
- return;
-
/* check for a hardlockup on the next CPU */
next_cpu = watchdog_next_cpu(smp_processor_id());
if (next_cpu >= nr_cpu_ids)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index aeaec79bc09c..5f747f241a5f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -41,6 +41,7 @@
#include <linux/mempolicy.h>
#include <linux/freezer.h>
#include <linux/debug_locks.h>
+#include <linux/device/devres.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
#include <linux/jhash.h>
@@ -130,6 +131,14 @@ enum wq_internal_consts {
WORKER_ID_LEN = 10 + WQ_NAME_LEN, /* "kworker/R-" + WQ_NAME_LEN */
};
+/* Layout of shards within one LLC pod */
+struct llc_shard_layout {
+ int nr_large_shards; /* number of large shards (cores_per_shard + 1) */
+ int cores_per_shard; /* base number of cores per default shard */
+ int nr_shards; /* total number of shards */
+ /* nr_default shards = (nr_shards - nr_large_shards) */
+};
+
/*
* We don't want to trap softirq for too long. See MAX_SOFTIRQ_TIME and
* MAX_SOFTIRQ_RESTART in kernel/softirq.c. These are macros because
@@ -190,7 +199,7 @@ struct worker_pool {
int id; /* I: pool ID */
unsigned int flags; /* L: flags */
- unsigned long watchdog_ts; /* L: watchdog timestamp */
+ unsigned long last_progress_ts; /* L: last forward progress timestamp */
bool cpu_stall; /* WD: stalled cpu bound pool */
/*
@@ -404,11 +413,12 @@ struct work_offq_data {
u32 flags;
};
-static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = {
+static const char * const wq_affn_names[WQ_AFFN_NR_TYPES] = {
[WQ_AFFN_DFL] = "default",
[WQ_AFFN_CPU] = "cpu",
[WQ_AFFN_SMT] = "smt",
[WQ_AFFN_CACHE] = "cache",
+ [WQ_AFFN_CACHE_SHARD] = "cache_shard",
[WQ_AFFN_NUMA] = "numa",
[WQ_AFFN_SYSTEM] = "system",
};
@@ -431,13 +441,16 @@ module_param_named(cpu_intensive_warning_thresh, wq_cpu_intensive_warning_thresh
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);
+static unsigned int wq_cache_shard_size = 8;
+module_param_named(cache_shard_size, wq_cache_shard_size, uint, 0444);
+
static bool wq_online; /* can kworkers be created yet? */
static bool wq_topo_initialized __read_mostly = false;
static struct kmem_cache *pwq_cache;
static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
-static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE;
+static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE_SHARD;
/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */
static struct workqueue_attrs *unbound_wq_update_pwq_attrs_buf;
@@ -530,6 +543,8 @@ struct workqueue_struct *system_bh_wq;
EXPORT_SYMBOL_GPL(system_bh_wq);
struct workqueue_struct *system_bh_highpri_wq;
EXPORT_SYMBOL_GPL(system_bh_highpri_wq);
+struct workqueue_struct *system_dfl_long_wq __ro_after_init;
+EXPORT_SYMBOL_GPL(system_dfl_long_wq);
static int worker_thread(void *__worker);
static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
@@ -1697,7 +1712,7 @@ static void __pwq_activate_work(struct pool_workqueue *pwq,
WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE));
trace_workqueue_activate_work(work);
if (list_empty(&pwq->pool->worklist))
- pwq->pool->watchdog_ts = jiffies;
+ pwq->pool->last_progress_ts = jiffies;
move_linked_works(work, &pwq->pool->worklist, NULL);
__clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb);
}
@@ -1849,8 +1864,20 @@ static void unplug_oldest_pwq(struct workqueue_struct *wq)
raw_spin_lock_irq(&pwq->pool->lock);
if (pwq->plugged) {
pwq->plugged = false;
- if (pwq_activate_first_inactive(pwq, true))
+ if (pwq_activate_first_inactive(pwq, true)) {
+ /*
+ * While plugged, queueing skips activation which
+ * includes bumping the nr_active count and adding the
+ * pwq to nna->pending_pwqs if the count can't be
+ * obtained. We need to restore both for the pwq being
+ * unplugged. The first call activates the first
+ * inactive work item and the second, if there are more
+ * inactive, puts the pwq on pending_pwqs.
+ */
+ pwq_activate_first_inactive(pwq, false);
+
kick_pool(pwq->pool);
+ }
}
raw_spin_unlock_irq(&pwq->pool->lock);
}
@@ -2348,7 +2375,7 @@ retry:
*/
if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) {
if (list_empty(&pool->worklist))
- pool->watchdog_ts = jiffies;
+ pool->last_progress_ts = jiffies;
trace_workqueue_activate_work(work);
insert_work(pwq, work, &pool->worklist, work_flags);
@@ -2507,7 +2534,6 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
struct timer_list *timer = &dwork->timer;
struct work_struct *work = &dwork->work;
- WARN_ON_ONCE(!wq);
WARN_ON_ONCE(timer->function != delayed_work_timer_fn);
WARN_ON_ONCE(timer_pending(timer));
WARN_ON_ONCE(!list_empty(&work->entry));
@@ -3204,6 +3230,7 @@ __acquires(&pool->lock)
worker->current_pwq = pwq;
if (worker->task)
worker->current_at = worker->task->se.sum_exec_runtime;
+ worker->current_start = jiffies;
work_data = *work_data_bits(work);
worker->current_color = get_work_color(work_data);
@@ -3352,7 +3379,7 @@ static void process_scheduled_works(struct worker *worker)
while ((work = list_first_entry_or_null(&worker->scheduled,
struct work_struct, entry))) {
if (first) {
- worker->pool->watchdog_ts = jiffies;
+ worker->pool->last_progress_ts = jiffies;
first = false;
}
process_one_work(worker, work);
@@ -4850,7 +4877,7 @@ static int init_worker_pool(struct worker_pool *pool)
pool->cpu = -1;
pool->node = NUMA_NO_NODE;
pool->flags |= POOL_DISASSOCIATED;
- pool->watchdog_ts = jiffies;
+ pool->last_progress_ts = jiffies;
INIT_LIST_HEAD(&pool->worklist);
INIT_LIST_HEAD(&pool->idle_list);
hash_init(pool->busy_hash);
@@ -5622,8 +5649,16 @@ enomem:
for_each_possible_cpu(cpu) {
struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);
- if (pwq)
+ if (pwq) {
+ /*
+ * Unlink pwq from wq->pwqs since link_pwq()
+ * may have already added it. wq->mutex is not
+ * needed as the wq has not been published yet.
+ */
+ if (!list_empty(&pwq->pwqs_node))
+ list_del_rcu(&pwq->pwqs_node);
kmem_cache_free(pwq_cache, pwq);
+ }
}
free_percpu(wq->cpu_pwq);
wq->cpu_pwq = NULL;
@@ -5891,6 +5926,33 @@ struct workqueue_struct *alloc_workqueue_noprof(const char *fmt,
}
EXPORT_SYMBOL_GPL(alloc_workqueue_noprof);
+/* devres action: destroy the workqueue when the owning device is released */
+static void devm_workqueue_release(void *res)
+{
+ destroy_workqueue(res);
+}
+
+/**
+ * devm_alloc_workqueue - device-managed alloc_workqueue()
+ * @dev: device whose lifetime bounds the workqueue
+ * @fmt: printf format for the workqueue name
+ * @flags: WQ_* flags
+ * @max_active: max in-flight work items
+ *
+ * Returns the workqueue, or NULL on allocation or devres failure
+ * (devm_add_action_or_reset() destroys the queue on failure).
+ *
+ * NOTE(review): the collected va_list is passed to the variadic
+ * alloc_workqueue() as a single '...' argument, so the name format
+ * arguments cannot be expanded correctly -- confirm this should use a
+ * va_list-taking variant instead.
+ */
+__printf(2, 5) struct workqueue_struct *
+devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags,
+ int max_active, ...)
+{
+ struct workqueue_struct *wq;
+ va_list args;
+ int ret;
+
+ va_start(args, max_active);
+ wq = alloc_workqueue(fmt, flags, max_active, args);
+ va_end(args);
+ if (!wq)
+ return NULL;
+
+ ret = devm_add_action_or_reset(dev, devm_workqueue_release, wq);
+ if (ret)
+ return NULL;
+
+ return wq;
+}
+EXPORT_SYMBOL_GPL(devm_alloc_workqueue);
+
#ifdef CONFIG_LOCKDEP
__printf(1, 5)
struct workqueue_struct *
@@ -6274,7 +6336,7 @@ static void pr_cont_worker_id(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
- if (pool->flags & WQ_BH)
+ if (pool->flags & POOL_BH)
pr_cont("bh%s",
pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : "");
else
@@ -6359,6 +6421,8 @@ static void show_pwq(struct pool_workqueue *pwq)
pr_cont(" %s", comma ? "," : "");
pr_cont_worker_id(worker);
pr_cont(":%ps", worker->current_func);
+ pr_cont(" for %us",
+ jiffies_to_msecs(jiffies - worker->current_start) / 1000);
list_for_each_entry(work, &worker->scheduled, entry)
pr_cont_work(false, work, &pcws);
pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
@@ -6462,7 +6526,7 @@ static void show_one_worker_pool(struct worker_pool *pool)
/* How long the first pending work is waiting for a worker. */
if (!list_empty(&pool->worklist))
- hung = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000;
+ hung = jiffies_to_msecs(jiffies - pool->last_progress_ts) / 1000;
/*
* Defer printing to avoid deadlocks in console drivers that
@@ -7044,7 +7108,7 @@ int workqueue_unbound_housekeeping_update(const struct cpumask *hk)
/*
* If the operation fails, it will fall back to
* wq_requested_unbound_cpumask which is initially set to
- * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten
+ * HK_TYPE_DOMAIN house keeping mask and rewritten
* by any subsequent write to workqueue/cpumask sysfs file.
*/
if (!cpumask_and(cpumask, wq_requested_unbound_cpumask, hk))
@@ -7063,13 +7127,7 @@ int workqueue_unbound_housekeeping_update(const struct cpumask *hk)
static int parse_affn_scope(const char *val)
{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(wq_affn_names); i++) {
- if (!strncasecmp(val, wq_affn_names[i], strlen(wq_affn_names[i])))
- return i;
- }
- return -EINVAL;
+ return sysfs_match_string(wq_affn_names, val);
}
static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp)
@@ -7176,7 +7234,26 @@ static struct attribute *wq_sysfs_attrs[] = {
&dev_attr_max_active.attr,
NULL,
};
-ATTRIBUTE_GROUPS(wq_sysfs);
+
+static umode_t wq_sysfs_is_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct workqueue_struct *wq = dev_to_wq(dev);
+
+ /*
+ * Adjusting max_active breaks ordering guarantee. Changing it has no
+ * effect on BH worker. Limit max_active to RO in such case.
+ */
+ if (wq->flags & (WQ_BH | __WQ_ORDERED))
+ return 0444;
+ return a->mode;
+}
+
+static const struct attribute_group wq_sysfs_group = {
+ .is_visible = wq_sysfs_is_visible,
+ .attrs = wq_sysfs_attrs,
+};
+__ATTRIBUTE_GROUPS(wq_sysfs);
static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
char *buf)
@@ -7479,13 +7556,6 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
struct wq_device *wq_dev;
int ret;
- /*
- * Adjusting max_active breaks ordering guarantee. Disallow exposing
- * ordered workqueues.
- */
- if (WARN_ON(wq->flags & __WQ_ORDERED))
- return -EINVAL;
-
wq->wq_dev = wq_dev = kzalloc_obj(*wq_dev);
if (!wq_dev)
return -ENOMEM;
@@ -7580,11 +7650,11 @@ MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds
/*
* Show workers that might prevent the processing of pending work items.
- * The only candidates are CPU-bound workers in the running state.
- * Pending work items should be handled by another idle worker
- * in all other situations.
+ * A busy worker that is not running on the CPU (e.g. sleeping in
+ * wait_event_idle() with PF_WQ_WORKER cleared) can stall the pool just as
+ * effectively as a CPU-bound one, so dump every in-flight worker.
*/
-static void show_cpu_pool_hog(struct worker_pool *pool)
+static void show_cpu_pool_busy_workers(struct worker_pool *pool)
{
struct worker *worker;
unsigned long irq_flags;
@@ -7593,36 +7663,34 @@ static void show_cpu_pool_hog(struct worker_pool *pool)
raw_spin_lock_irqsave(&pool->lock, irq_flags);
hash_for_each(pool->busy_hash, bkt, worker, hentry) {
- if (task_is_running(worker->task)) {
- /*
- * Defer printing to avoid deadlocks in console
- * drivers that queue work while holding locks
- * also taken in their write paths.
- */
- printk_deferred_enter();
+ /*
+ * Defer printing to avoid deadlocks in console
+ * drivers that queue work while holding locks
+ * also taken in their write paths.
+ */
+ printk_deferred_enter();
- pr_info("pool %d:\n", pool->id);
- sched_show_task(worker->task);
+ pr_info("pool %d:\n", pool->id);
+ sched_show_task(worker->task);
- printk_deferred_exit();
- }
+ printk_deferred_exit();
}
raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
}
-static void show_cpu_pools_hogs(void)
+static void show_cpu_pools_busy_workers(void)
{
struct worker_pool *pool;
int pi;
- pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n");
+ pr_info("Showing backtraces of busy workers in stalled worker pools:\n");
rcu_read_lock();
for_each_pool(pool, pi) {
if (pool->cpu_stall)
- show_cpu_pool_hog(pool);
+ show_cpu_pool_busy_workers(pool);
}
@@ -7691,15 +7759,36 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu));
else
touched = READ_ONCE(wq_watchdog_touched);
- pool_ts = READ_ONCE(pool->watchdog_ts);
+ pool_ts = READ_ONCE(pool->last_progress_ts);
if (time_after(pool_ts, touched))
ts = pool_ts;
else
ts = touched;
- /* did we stall? */
+ /*
+ * Did we stall?
+ *
+ * Do a lockless check first so as not to disturb the system.
+ *
+ * Prevent false positives by double checking the timestamp
+ * under pool->lock. The lock makes sure that the check reads
+ * an updated pool->last_progress_ts when this CPU saw
+ * an already updated pool->worklist above. It seems better
+ * than adding another barrier into __queue_work() which
+ * is a hotter path.
+ */
if (time_after(now, ts + thresh)) {
+ scoped_guard(raw_spinlock_irqsave, &pool->lock) {
+ pool_ts = pool->last_progress_ts;
+ if (time_after(pool_ts, touched))
+ ts = pool_ts;
+ else
+ ts = touched;
+ }
+ if (!time_after(now, ts + thresh))
+ continue;
+
lockup_detected = true;
stall_time = jiffies_to_msecs(now - pool_ts) / 1000;
max_stall_time = max(max_stall_time, stall_time);
@@ -7711,15 +7800,13 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
pr_cont_pool_info(pool);
pr_cont(" stuck for %us!\n", stall_time);
}
-
-
}
if (lockup_detected)
show_all_workqueues();
if (cpu_pool_stall)
- show_cpu_pools_hogs();
+ show_cpu_pools_busy_workers();
if (lockup_detected)
panic_on_wq_watchdog(max_stall_time);
@@ -7845,8 +7932,8 @@ void __init workqueue_init_early(void)
{
struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM];
int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
- void (*irq_work_fns[2])(struct irq_work *) = { bh_pool_kick_normal,
- bh_pool_kick_highpri };
+ void (*irq_work_fns[NR_STD_WORKER_POOLS])(struct irq_work *) =
+ { bh_pool_kick_normal, bh_pool_kick_highpri };
int i, cpu;
BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
@@ -7858,7 +7945,6 @@ void __init workqueue_init_early(void)
cpumask_copy(wq_online_cpumask, cpu_online_mask);
cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
- restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ));
restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN));
if (!cpumask_empty(&wq_cmdline_cpumask))
restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);
@@ -7942,11 +8028,12 @@ void __init workqueue_init_early(void)
system_bh_wq = alloc_workqueue("events_bh", WQ_BH | WQ_PERCPU, 0);
system_bh_highpri_wq = alloc_workqueue("events_bh_highpri",
WQ_BH | WQ_HIGHPRI | WQ_PERCPU, 0);
+ system_dfl_long_wq = alloc_workqueue("events_dfl_long", WQ_UNBOUND, WQ_MAX_ACTIVE);
BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq ||
!system_unbound_wq || !system_freezable_wq || !system_dfl_wq ||
!system_power_efficient_wq ||
!system_freezable_power_efficient_wq ||
- !system_bh_wq || !system_bh_highpri_wq);
+ !system_bh_wq || !system_bh_highpri_wq || !system_dfl_long_wq);
}
static void __init wq_cpu_intensive_thresh_init(void)
@@ -8112,6 +8199,186 @@ static bool __init cpus_share_numa(int cpu0, int cpu1)
return cpu_to_node(cpu0) == cpu_to_node(cpu1);
}
+/* Maps each CPU to its shard index within the LLC pod it belongs to */
+static int cpu_shard_id[NR_CPUS] __initdata;
+
+/**
+ * llc_count_cores - count distinct cores (SMT groups) within an LLC pod
+ * @pod_cpus: the cpumask of CPUs in the LLC pod
+ * @smt_pods: the SMT pod type, used to identify sibling groups
+ *
+ * A core is represented by the lowest-numbered CPU in its SMT group. Returns
+ * the number of distinct cores found in @pod_cpus.
+ */
+static int __init llc_count_cores(const struct cpumask *pod_cpus,
+ struct wq_pod_type *smt_pods)
+{
+ const struct cpumask *sibling_cpus;
+ int nr_cores = 0, c;
+
+ /*
+ * Count only the lowest-numbered CPU of each SMT sibling group;
+ * that CPU stands in as the core's canonical representative.
+ */
+ for_each_cpu(c, pod_cpus) {
+ sibling_cpus = smt_pods->pod_cpus[smt_pods->cpu_pod[c]];
+ if (cpumask_first(sibling_cpus) == c)
+ nr_cores++;
+ }
+
+ return nr_cores;
+}
+
+/*
+ * llc_shard_size - number of cores in shard @shard_id
+ *
+ * Cores are spread across shards as evenly as possible. The first
+ * @nr_large_shards shards are "large shards" holding (cores_per_shard + 1)
+ * cores each; every remaining shard holds cores_per_shard cores.
+ */
+static int __init llc_shard_size(int shard_id, int cores_per_shard, int nr_large_shards)
+{
+ /* The first @nr_large_shards shards each absorb one leftover core */
+ if (shard_id < nr_large_shards)
+ return cores_per_shard + 1;
+
+ /* The remaining shards hold the base number of cores */
+ return cores_per_shard;
+}
+
+/*
+ * llc_calc_shard_layout - compute the shard layout for an LLC pod
+ * @nr_cores: number of distinct cores in the LLC pod
+ *
+ * Chooses the number of shards that keeps average shard size closest to
+ * wq_cache_shard_size. Returns a struct describing the total number of shards,
+ * the base size of each, and how many are large shards (one extra core each).
+ */
+static struct llc_shard_layout __init llc_calc_shard_layout(int nr_cores)
+{
+ struct llc_shard_layout layout;
+
+ /* At least one shard; round to the count closest to the target size */
+ layout.nr_shards = max(1, DIV_ROUND_CLOSEST(nr_cores, wq_cache_shard_size));
+ layout.cores_per_shard = nr_cores / layout.nr_shards;
+ layout.nr_large_shards = nr_cores % layout.nr_shards;
+
+ return layout;
+}
+
+/*
+ * llc_shard_is_full - check whether a shard has reached its core capacity
+ * @cores_in_shard: number of cores already assigned to this shard
+ * @shard_id: index of the shard being checked
+ * @layout: the shard layout computed by llc_calc_shard_layout()
+ *
+ * Returns true if @cores_in_shard equals the expected size for @shard_id.
+ */
+static bool __init llc_shard_is_full(int cores_in_shard, int shard_id,
+ const struct llc_shard_layout *layout)
+{
+ return cores_in_shard == llc_shard_size(shard_id, layout->cores_per_shard,
+ layout->nr_large_shards);
+}
+
+/**
+ * llc_populate_cpu_shard_id - populate cpu_shard_id[] for each CPU in an LLC pod
+ * @pod_cpus: the cpumask of CPUs in the LLC pod
+ * @smt_pods: the SMT pod type, used to identify sibling groups
+ * @nr_cores: number of distinct cores in @pod_cpus (from llc_count_cores())
+ *
+ * Walks @pod_cpus in order. At each SMT group leader, advances to the next
+ * shard once the current shard is full. Results are written to cpu_shard_id[].
+ */
+static void __init llc_populate_cpu_shard_id(const struct cpumask *pod_cpus,
+ struct wq_pod_type *smt_pods,
+ int nr_cores)
+{
+ struct llc_shard_layout layout = llc_calc_shard_layout(nr_cores);
+ const struct cpumask *sibling_cpus;
+ /* Number of cores placed into the shard currently being filled */
+ int cores_in_shard = 0;
+ unsigned int leader;
+ /* Cursor over shards; runs from 0 to layout.nr_shards - 1 */
+ int shard_id = 0;
+ int c;
+
+ /* Walk every CPU in this LLC pod and assign it a shard */
+ for_each_cpu(c, pod_cpus) {
+ sibling_cpus = smt_pods->pod_cpus[smt_pods->cpu_pod[c]];
+ if (cpumask_first(sibling_cpus) == c) {
+ /* Lowest-numbered CPU: the leader of this SMT group */
+ if (llc_shard_is_full(cores_in_shard, shard_id, &layout)) {
+ shard_id++;
+ cores_in_shard = 0;
+ }
+ cores_in_shard++;
+ cpu_shard_id[c] = shard_id;
+ } else {
+ /*
+ * Non-leader siblings MUST inherit the leader's shard;
+ * threads of one core are never split across shards.
+ */
+ leader = cpumask_first(sibling_cpus);
+
+ /*
+ * This check silences a Warray-bounds warning on UP
+ * configs where NR_CPUS=1 makes cpu_shard_id[]
+ * a single-element array, and the compiler can't
+ * prove the index is always 0.
+ */
+ if (WARN_ON_ONCE(leader >= nr_cpu_ids))
+ continue;
+ cpu_shard_id[c] = cpu_shard_id[leader];
+ }
+ }
+
+ WARN_ON_ONCE(shard_id != (layout.nr_shards - 1));
+}
+
+/**
+ * precompute_cache_shard_ids - assign each CPU its shard index within its LLC
+ *
+ * Sanitizes wq_cache_shard_size, then iterates over all LLC pods: for each
+ * pod, counts distinct cores and assigns shard indices to its CPUs. Must be
+ * called after WQ_AFFN_CACHE and WQ_AFFN_SMT have been initialized.
+ */
+static void __init precompute_cache_shard_ids(void)
+{
+ struct wq_pod_type *llc_pods = &wq_pod_types[WQ_AFFN_CACHE];
+ struct wq_pod_type *smt_pods = &wq_pod_types[WQ_AFFN_SMT];
+ const struct cpumask *cpus_sharing_llc;
+ int nr_cores;
+ int pod;
+
+ if (!wq_cache_shard_size) {
+ pr_warn("workqueue: cache_shard_size must be > 0, setting to 1\n");
+ wq_cache_shard_size = 1;
+ }
+
+ for (pod = 0; pod < llc_pods->nr_pods; pod++) {
+ cpus_sharing_llc = llc_pods->pod_cpus[pod];
+
+ /* Distinct cores (SMT groups) in this LLC pod */
+ nr_cores = llc_count_cores(cpus_sharing_llc, smt_pods);
+ llc_populate_cpu_shard_id(cpus_sharing_llc, smt_pods, nr_cores);
+ }
+}
+
+/*
+ * cpus_share_cache_shard - test whether two CPUs belong to the same cache shard
+ *
+ * Two CPUs share a cache shard iff they are in the same LLC and carry the
+ * same precomputed shard index. Used as the pod affinity callback for
+ * WQ_AFFN_CACHE_SHARD.
+ */
+static bool __init cpus_share_cache_shard(int cpu0, int cpu1)
+{
+ if (!cpus_share_cache(cpu0, cpu1))
+ return false;
+
+ return cpu_shard_id[cpu0] == cpu_shard_id[cpu1];
+}
+
/**
* workqueue_init_topology - initialize CPU pods for unbound workqueues
*
@@ -8127,6 +8394,8 @@ void __init workqueue_init_topology(void)
init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share);
init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt);
init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache);
+ precompute_cache_shard_ids();
+ init_pod_type(&wq_pod_types[WQ_AFFN_CACHE_SHARD], cpus_share_cache_shard);
init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa);
wq_topo_initialized = true;
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index f6275944ada7..8def1ddc5a1b 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -32,6 +32,7 @@ struct worker {
work_func_t current_func; /* K: function */
struct pool_workqueue *current_pwq; /* K: pwq */
u64 current_at; /* K: runtime at start or last wakeup */
+ unsigned long current_start; /* K: start time of current work item */
unsigned int current_color; /* K: color */
int sleeping; /* S: is worker sleeping? */