summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/acct.c2
-rw-r--r--kernel/audit.c6
-rw-r--r--kernel/audit.h19
-rw-r--r--kernel/audit_fsnotify.c34
-rw-r--r--kernel/audit_tree.c2
-rw-r--r--kernel/audit_watch.c25
-rw-r--r--kernel/auditfilter.c17
-rw-r--r--kernel/auditsc.c4
-rw-r--r--kernel/bpf/Makefile2
-rw-r--r--kernel/bpf/arena.c317
-rw-r--r--kernel/bpf/arraymap.c16
-rw-r--r--kernel/bpf/backtrack.c82
-rw-r--r--kernel/bpf/bpf_lru_list.c165
-rw-r--r--kernel/bpf/bpf_lru_list.h25
-rw-r--r--kernel/bpf/bpf_lsm.c20
-rw-r--r--kernel/bpf/bpf_struct_ops.c63
-rw-r--r--kernel/bpf/btf.c318
-rw-r--r--kernel/bpf/cfg.c13
-rw-r--r--kernel/bpf/cgroup.c65
-rw-r--r--kernel/bpf/cnum.c120
-rw-r--r--kernel/bpf/cnum_defs.h247
-rw-r--r--kernel/bpf/const_fold.c8
-rw-r--r--kernel/bpf/core.c54
-rw-r--r--kernel/bpf/devmap.c19
-rw-r--r--kernel/bpf/fixups.c40
-rw-r--r--kernel/bpf/hashtab.c840
-rw-r--r--kernel/bpf/helpers.c209
-rw-r--r--kernel/bpf/inode.c260
-rw-r--r--kernel/bpf/liveness.c208
-rw-r--r--kernel/bpf/log.c132
-rw-r--r--kernel/bpf/lpm_trie.c8
-rw-r--r--kernel/bpf/map_in_map.c5
-rw-r--r--kernel/bpf/map_iter.c7
-rw-r--r--kernel/bpf/stackmap.c215
-rw-r--r--kernel/bpf/states.c67
-rw-r--r--kernel/bpf/syscall.c338
-rw-r--r--kernel/bpf/trampoline.c671
-rw-r--r--kernel/bpf/verifier.c4173
-rw-r--r--kernel/cgroup/cgroup.c416
-rw-r--r--kernel/cgroup/cpuset-internal.h1
-rw-r--r--kernel/cgroup/cpuset-v1.c2
-rw-r--r--kernel/cgroup/cpuset.c75
-rw-r--r--kernel/cgroup/dmem.c1
-rw-r--r--kernel/cgroup/rdma.c315
-rw-r--r--kernel/cgroup/rstat.c37
-rw-r--r--kernel/configs/hardening.config2
-rw-r--r--kernel/cpu.c4
-rw-r--r--kernel/cred.c3
-rw-r--r--kernel/dma/debug.c11
-rw-r--r--kernel/dma/direct.c6
-rw-r--r--kernel/dma/mapping.c16
-rw-r--r--kernel/entry/common.c9
-rw-r--r--kernel/events/core.c108
-rw-r--r--kernel/events/internal.h1
-rw-r--r--kernel/events/ring_buffer.c2
-rw-r--r--kernel/exec_state.c119
-rw-r--r--kernel/exit.c34
-rw-r--r--kernel/fork.c59
-rw-r--r--kernel/futex/core.c460
-rw-r--r--kernel/futex/futex.h52
-rw-r--r--kernel/futex/pi.c62
-rw-r--r--kernel/futex/requeue.c26
-rw-r--r--kernel/futex/syscalls.c36
-rw-r--r--kernel/futex/waitwake.c47
-rw-r--r--kernel/irq/chip.c17
-rw-r--r--kernel/irq/debugfs.h44
-rw-r--r--kernel/irq/internals.h66
-rw-r--r--kernel/irq/irqdesc.c70
-rw-r--r--kernel/irq/irqdomain.c5
-rw-r--r--kernel/irq/manage.c45
-rw-r--r--kernel/irq/proc.c234
-rw-r--r--kernel/irq/proc.h13
-rw-r--r--kernel/irq/settings.h13
-rw-r--r--kernel/irq_work.c7
-rw-r--r--kernel/kthread.c1
-rw-r--r--kernel/liveupdate/kexec_handover.c58
-rw-r--r--kernel/locking/mutex.c65
-rw-r--r--kernel/locking/percpu-rwsem.c29
-rw-r--r--kernel/locking/rtmutex.c8
-rw-r--r--kernel/locking/rtmutex_api.c33
-rw-r--r--kernel/locking/rwbase_rt.c6
-rw-r--r--kernel/locking/rwsem.c10
-rw-r--r--kernel/locking/semaphore.c4
-rw-r--r--kernel/locking/ww_mutex.h4
-rw-r--r--kernel/module/decompress.c2
-rw-r--r--kernel/panic.c11
-rw-r--r--kernel/params.c8
-rw-r--r--kernel/pid.c8
-rw-r--r--kernel/power/Kconfig1
-rw-r--r--kernel/power/hibernate.c46
-rw-r--r--kernel/power/qos.c11
-rw-r--r--kernel/power/swap.c13
-rw-r--r--kernel/ptrace.c57
-rw-r--r--kernel/rcu/rcutorture.c29
-rw-r--r--kernel/rcu/srcutree.c12
-rw-r--r--kernel/rcu/tasks.h3
-rw-r--r--kernel/rcu/tree.c65
-rw-r--r--kernel/rcu/tree_nocb.h2
-rw-r--r--kernel/rcu/tree_stall.h7
-rw-r--r--kernel/rseq.c214
-rw-r--r--kernel/sched/build_policy.c9
-rw-r--r--kernel/sched/core.c445
-rw-r--r--kernel/sched/core_sched.c2
-rw-r--r--kernel/sched/cputime.c314
-rw-r--r--kernel/sched/deadline.c275
-rw-r--r--kernel/sched/debug.c166
-rw-r--r--kernel/sched/ext.c1675
-rw-r--r--kernel/sched/ext_arena.c131
-rw-r--r--kernel/sched/ext_arena.h18
-rw-r--r--kernel/sched/ext_cid.c707
-rw-r--r--kernel/sched/ext_cid.h271
-rw-r--r--kernel/sched/ext_idle.c41
-rw-r--r--kernel/sched/ext_internal.h280
-rw-r--r--kernel/sched/ext_types.h144
-rw-r--r--kernel/sched/fair.c2259
-rw-r--r--kernel/sched/features.h8
-rw-r--r--kernel/sched/idle.c15
-rw-r--r--kernel/sched/membarrier.c109
-rw-r--r--kernel/sched/rt.c12
-rw-r--r--kernel/sched/sched.h135
-rw-r--r--kernel/sched/stats.h9
-rw-r--r--kernel/sched/stop_task.c2
-rw-r--r--kernel/sched/topology.c526
-rw-r--r--kernel/signal.c1
-rw-r--r--kernel/stop_machine.c5
-rw-r--r--kernel/sys.c6
-rw-r--r--kernel/time/Kconfig4
-rw-r--r--kernel/time/alarmtimer.c72
-rw-r--r--kernel/time/clockevents.c2
-rw-r--r--kernel/time/clocksource.c29
-rw-r--r--kernel/time/hrtimer.c152
-rw-r--r--kernel/time/jiffies.c11
-rw-r--r--kernel/time/namespace.c2
-rw-r--r--kernel/time/posix-cpu-timers.c19
-rw-r--r--kernel/time/posix-timers.c35
-rw-r--r--kernel/time/posix-timers.h4
-rw-r--r--kernel/time/tick-sched.c215
-rw-r--r--kernel/time/tick-sched.h12
-rw-r--r--kernel/time/time.c2
-rw-r--r--kernel/time/timekeeping.c235
-rw-r--r--kernel/time/timer.c2
-rw-r--r--kernel/time/timer_list.c6
-rw-r--r--kernel/time/timer_migration.c289
-rw-r--r--kernel/time/timer_migration.h36
-rw-r--r--kernel/torture.c16
-rw-r--r--kernel/trace/Makefile7
-rw-r--r--kernel/trace/bpf_trace.c337
-rw-r--r--kernel/trace/fprobe.c23
-rw-r--r--kernel/trace/ftrace.c35
-rw-r--r--kernel/trace/remote_test.c4
-rw-r--r--kernel/trace/ring_buffer.c37
-rw-r--r--kernel/trace/rv/monitors/deadline/deadline.h3
-rw-r--r--kernel/trace/rv/monitors/nomiss/nomiss.c4
-rw-r--r--kernel/trace/rv/monitors/opid/opid.c12
-rw-r--r--kernel/trace/rv/monitors/stall/stall.c4
-rw-r--r--kernel/trace/simple_ring_buffer.c4
-rw-r--r--kernel/trace/trace_events_hist.c6
-rw-r--r--kernel/trace/trace_osnoise.c46
-rw-r--r--kernel/trace/trace_probe.c183
-rw-r--r--kernel/trace/trace_probe.h7
-rw-r--r--kernel/trace/trace_syscalls.c110
-rw-r--r--kernel/trace/trace_uprobe.c5
-rw-r--r--kernel/trace/tracing_map.c17
-rw-r--r--kernel/umh.c2
-rw-r--r--kernel/workqueue.c120
166 files changed, 15548 insertions, 6069 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 6785982013dc..1e1a31673577 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -3,7 +3,7 @@
# Makefile for the linux kernel.
#
-obj-y = fork.o exec_domain.o panic.o \
+obj-y = fork.o exec_domain.o exec_state.o panic.o \
cpu.o exit.o softirq.o resource.o \
sysctl.o capability.o ptrace.o user.o \
signal.o sys.o umh.o workqueue.o pid.o task_work.o \
diff --git a/kernel/acct.c b/kernel/acct.c
index cbbf79d718cf..c440d43479ca 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -249,7 +249,7 @@ static int acct_on(const char __user *name)
return -EINVAL;
/* Exclude procfs and sysfs. */
- if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE)
+ if (file_inode(file)->i_sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED)
return -EINVAL;
if (!(file->f_mode & FMODE_CAN_WRITE))
diff --git a/kernel/audit.c b/kernel/audit.c
index e1d489bc2dff..dcc657d35776 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1468,6 +1468,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
err = audit_list_rules_send(skb, seq);
break;
case AUDIT_TRIM:
+ if (audit_enabled == AUDIT_LOCKED)
+ return -EPERM;
audit_trim_trees();
audit_log_common_recv_msg(audit_context(), &ab,
AUDIT_CONFIG_CHANGE);
@@ -1480,6 +1482,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
size_t msglen = data_len;
char *old, *new;
+ if (audit_enabled == AUDIT_LOCKED)
+ return -EPERM;
err = -EINVAL;
if (msglen < 2 * sizeof(u32))
break;
@@ -2030,7 +2034,7 @@ void audit_log_vformat(struct audit_buffer *ab, const char *fmt, va_list args)
* here and AUDIT_BUFSIZ is at least 1024, then we can
* log everything that printk could have logged. */
avail = audit_expand(ab,
- max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
+ max_t(unsigned int, AUDIT_BUFSIZ, 1+len-avail));
if (!avail)
goto out_va_end;
len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
diff --git a/kernel/audit.h b/kernel/audit.h
index ac81fa02bcd7..92d5e723d570 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -233,7 +233,7 @@ static inline int audit_hash_ino(u64 ino)
/* Indicates that audit should log the full pathname. */
#define AUDIT_NAME_FULL -1
-extern int audit_match_class(int class, unsigned syscall);
+extern int audit_match_class(int class, unsigned int syscall);
extern int audit_comparator(const u32 left, const u32 op, const u32 right);
extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
@@ -256,8 +256,13 @@ extern int audit_del_rule(struct audit_entry *entry);
extern void audit_free_rule_rcu(struct rcu_head *head);
extern struct list_head audit_filter_list[];
-extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
+struct audit_watch_ctx {
+ struct inode *dir;
+ struct inode *child;
+};
+extern struct audit_entry *audit_dupe_rule(struct audit_krule *old,
+ struct audit_watch_ctx *ctx);
extern void audit_log_d_path_exe(struct audit_buffer *ab,
struct mm_struct *mm);
@@ -280,13 +285,15 @@ extern char *audit_watch_path(struct audit_watch *watch);
extern int audit_watch_compare(struct audit_watch *watch, u64 ino, dev_t dev);
extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule,
- char *pathname, int len);
+ char *pathname, int len,
+ struct audit_watch_ctx *ctx);
extern char *audit_mark_path(struct audit_fsnotify_mark *mark);
extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark);
extern void audit_remove_mark_rule(struct audit_krule *krule);
extern int audit_mark_compare(struct audit_fsnotify_mark *mark, u64 ino,
dev_t dev);
-extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old);
+extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old,
+ struct audit_watch_ctx *ctx);
extern int audit_exe_compare(struct task_struct *tsk,
struct audit_fsnotify_mark *mark);
@@ -317,13 +324,13 @@ extern struct list_head *audit_killed_trees(void);
#define audit_watch_path(w) ""
#define audit_watch_compare(w, i, d) 0
-#define audit_alloc_mark(k, p, l) (ERR_PTR(-EINVAL))
+#define audit_alloc_mark(k, p, l, c) (ERR_PTR(-EINVAL))
#define audit_mark_path(m) ""
#define audit_remove_mark(m) do { } while (0)
#define audit_remove_mark_rule(k) do { } while (0)
#define audit_mark_compare(m, i, d) 0
#define audit_exe_compare(t, m) (-EINVAL)
-#define audit_dupe_exe(n, o) (-EINVAL)
+#define audit_dupe_exe(n, o, c) (-EINVAL)
#define audit_remove_tree_rule(rule) BUG()
#define audit_add_tree_rule(rule) -EINVAL
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 711454f9f724..fa33d57e4320 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -71,22 +71,29 @@ static void audit_update_mark(struct audit_fsnotify_mark *audit_mark,
audit_mark->ino = inode ? inode->i_ino : AUDIT_INO_UNSET;
}
-struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len)
+struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname,
+ int len, struct audit_watch_ctx *ctx)
{
struct audit_fsnotify_mark *audit_mark;
struct path path;
struct dentry *dentry;
- int ret;
+ struct inode *dir, *child;
+ int ret, allow_dups;
if (pathname[0] != '/' || pathname[len-1] == '/')
return ERR_PTR(-EINVAL);
- dentry = kern_path_parent(pathname, &path);
- if (IS_ERR(dentry))
- return ERR_CAST(dentry); /* returning an error */
- if (d_really_is_negative(dentry)) {
- audit_mark = ERR_PTR(-ENOENT);
- goto out;
+ if (!ctx) {
+ dentry = kern_path_parent(pathname, &path);
+ if (IS_ERR(dentry))
+ return ERR_CAST(dentry); /* returning an error */
+ dir = d_inode(path.dentry);
+ child = d_inode(dentry);
+ allow_dups = 0;
+ } else {
+ dir = ctx->dir;
+ child = ctx->child;
+ allow_dups = 1;
}
audit_mark = kzalloc_obj(*audit_mark);
@@ -98,18 +105,21 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_group);
audit_mark->mark.mask = AUDIT_FS_EVENTS;
audit_mark->path = pathname;
- audit_update_mark(audit_mark, dentry->d_inode);
audit_mark->rule = krule;
- ret = fsnotify_add_inode_mark(&audit_mark->mark, path.dentry->d_inode, 0);
+ audit_update_mark(audit_mark, child);
+ ret = fsnotify_add_inode_mark(&audit_mark->mark, dir, allow_dups);
+
if (ret < 0) {
audit_mark->path = NULL;
fsnotify_put_mark(&audit_mark->mark);
audit_mark = ERR_PTR(ret);
}
out:
- dput(dentry);
- path_put(&path);
+ if (!ctx) {
+ dput(dentry);
+ path_put(&path);
+ }
return audit_mark;
}
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index ee84777fdfad..1ed19b775912 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -33,7 +33,7 @@ struct audit_chunk {
struct audit_node {
struct list_head list;
struct audit_tree *owner;
- unsigned index; /* index; upper bit indicates 'will prune' */
+ unsigned int index; /* index; upper bit indicates 'will prune' */
} owners[] __counted_by(count);
};
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 33577f0f54ef..06dd0ebe73e2 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -244,7 +244,8 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
/* Update inode info in audit rules based on filesystem event. */
static void audit_update_watch(struct audit_parent *parent,
const struct qstr *dname, dev_t dev,
- u64 ino, unsigned invalidating)
+ u64 ino, unsigned int invalidating,
+ struct audit_watch_ctx *ctx)
{
struct audit_watch *owatch, *nwatch, *nextw;
struct audit_krule *r, *nextr;
@@ -280,7 +281,7 @@ static void audit_update_watch(struct audit_parent *parent,
list_del(&oentry->rule.rlist);
list_del_rcu(&oentry->list);
- nentry = audit_dupe_rule(&oentry->rule);
+ nentry = audit_dupe_rule(&oentry->rule, ctx);
if (IS_ERR(nentry)) {
list_del(&oentry->rule.list);
audit_panic("error updating watch, removing");
@@ -479,10 +480,17 @@ static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
if (WARN_ON_ONCE(inode_mark->group != audit_watch_group))
return 0;
- if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
- audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
- else if (mask & (FS_DELETE|FS_MOVED_FROM))
- audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1);
+ if (mask & (FS_CREATE|FS_MOVED_TO) && inode) {
+ struct audit_watch_ctx ctx = { .dir = dir, .child = inode };
+
+ audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0,
+ &ctx);
+ } else if (mask & (FS_DELETE|FS_MOVED_FROM)) {
+ struct audit_watch_ctx ctx = { .dir = dir, .child = NULL };
+
+ audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1,
+ &ctx);
+ }
else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
audit_remove_parent_watches(parent);
@@ -505,7 +513,8 @@ static int __init audit_watch_init(void)
}
device_initcall(audit_watch_init);
-int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old)
+int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old,
+ struct audit_watch_ctx *ctx)
{
struct audit_fsnotify_mark *audit_mark;
char *pathname;
@@ -514,7 +523,7 @@ int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old)
if (!pathname)
return -ENOMEM;
- audit_mark = audit_alloc_mark(new, pathname, strlen(pathname));
+ audit_mark = audit_alloc_mark(new, pathname, strlen(pathname), ctx);
if (IS_ERR(audit_mark)) {
kfree(pathname);
return PTR_ERR(audit_mark);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 093425123f6c..4401119b5275 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -165,13 +165,13 @@ static inline int audit_to_inode(struct audit_krule *krule,
static __u32 *classes[AUDIT_SYSCALL_CLASSES];
-int __init audit_register_class(int class, unsigned *list)
+int __init audit_register_class(int class, unsigned int *list)
{
__u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL);
if (!p)
return -ENOMEM;
while (*list != ~0U) {
- unsigned n = *list++;
+ unsigned int n = *list++;
if (n >= AUDIT_BITMASK_SIZE * 32 - AUDIT_SYSCALL_CLASSES) {
kfree(p);
return -EINVAL;
@@ -186,7 +186,7 @@ int __init audit_register_class(int class, unsigned *list)
return 0;
}
-int audit_match_class(int class, unsigned syscall)
+int audit_match_class(int class, unsigned int syscall)
{
if (unlikely(syscall >= AUDIT_BITMASK_SIZE * 32))
return 0;
@@ -237,7 +237,7 @@ static int audit_match_signal(struct audit_entry *entry)
/* Common user-space to kernel rule translation. */
static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *rule)
{
- unsigned listnr;
+ unsigned int listnr;
struct audit_entry *entry;
int i, err;
@@ -589,7 +589,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
err = PTR_ERR(str);
goto exit_free;
}
- audit_mark = audit_alloc_mark(&entry->rule, str, f_val);
+ audit_mark = audit_alloc_mark(&entry->rule, str, f_val, NULL);
if (IS_ERR(audit_mark)) {
kfree(str);
err = PTR_ERR(audit_mark);
@@ -816,7 +816,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
* rule with the new rule in the filterlist, then free the old rule.
* The rlist element is undefined; list manipulations are handled apart from
* the initial copy. */
-struct audit_entry *audit_dupe_rule(struct audit_krule *old)
+struct audit_entry *audit_dupe_rule(struct audit_krule *old,
+ struct audit_watch_ctx *ctx)
{
u32 fcount = old->field_count;
struct audit_entry *entry;
@@ -875,7 +876,7 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old)
new->filterkey = fk;
break;
case AUDIT_EXE:
- err = audit_dupe_exe(new, old);
+ err = audit_dupe_exe(new, old, ctx);
break;
}
if (err) {
@@ -1414,7 +1415,7 @@ static int update_lsm_rule(struct audit_krule *r)
if (!security_audit_rule_known(r))
return 0;
- nentry = audit_dupe_rule(r);
+ nentry = audit_dupe_rule(r, NULL);
if (entry->rule.exe)
audit_remove_mark(entry->rule.exe);
if (IS_ERR(nentry)) {
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ab54fccba215..6610e667c728 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -150,7 +150,7 @@ static const struct audit_nfcfgop_tab audit_nfcfgs[] = {
static int audit_match_perm(struct audit_context *ctx, int mask)
{
- unsigned n;
+ unsigned int n;
if (unlikely(!ctx))
return 0;
@@ -2786,7 +2786,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old)
context->capset.pid = task_tgid_nr(current);
context->capset.cap.effective = new->cap_effective;
- context->capset.cap.inheritable = new->cap_effective;
+ context->capset.cap.inheritable = new->cap_inheritable;
context->capset.cap.permitted = new->cap_permitted;
context->capset.cap.ambient = new->cap_ambient;
context->type = AUDIT_CAPSET;
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 399007b67a92..4dc41bf5780c 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
endif
CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy)
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o const_fold.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o cnum.o log.o token.o liveness.o const_fold.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 802656c6fd3c..af49c154473d 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -53,12 +53,15 @@ struct bpf_arena {
u64 user_vm_start;
u64 user_vm_end;
struct vm_struct *kern_vm;
+ struct page *scratch_page;
struct range_tree rt;
/* protects rt */
rqspinlock_t spinlock;
struct list_head vma_list;
/* protects vma_list */
struct mutex lock;
+ u64 zap_gen;
+ struct mutex zap_mutex;
struct irq_work free_irq;
struct work_struct free_work;
struct llist_head free_spans;
@@ -83,6 +86,32 @@ u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
return arena ? arena->user_vm_start : 0;
}
+/**
+ * bpf_arena_map_kern_vm_start - kern_vm_start lookup by struct bpf_map *
+ * @map: a BPF_MAP_TYPE_ARENA map
+ *
+ * Return @map's kern_vm_start.
+ */
+u64 bpf_arena_map_kern_vm_start(struct bpf_map *map)
+{
+ return bpf_arena_get_kern_vm_start(container_of(map, struct bpf_arena, map));
+}
+
+/**
+ * bpf_prog_arena - return the bpf_map of the arena referenced by @prog
+ * @prog: a loaded BPF program
+ *
+ * The verifier enforces at most one arena per program and stores it in
+ * prog->aux->arena. Return that arena's underlying bpf_map, or NULL if
+ * @prog does not reference an arena.
+ */
+struct bpf_map *bpf_prog_arena(struct bpf_prog *prog)
+{
+ struct bpf_arena *arena = prog->aux->arena;
+
+ return arena ? &arena->map : NULL;
+}
+
static long arena_map_peek_elem(struct bpf_map *map, void *value)
{
return -EOPNOTSUPP;
@@ -115,26 +144,57 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr)
struct apply_range_data {
struct page **pages;
+ struct page *scratch_page;
int i;
};
+struct clear_range_data {
+ struct llist_head *free_pages;
+ struct page *scratch_page;
+};
+
static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
{
struct apply_range_data *d = data;
struct page *page;
+ pte_t pteval;
if (!data)
return 0;
- /* sanity check */
- if (unlikely(!pte_none(ptep_get(pte))))
- return -EBUSY;
page = d->pages[d->i];
/* paranoia, similar to vmap_pages_pte_range() */
if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page))))
return -EINVAL;
- set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
+ pteval = mk_pte(page, PAGE_KERNEL);
+#ifdef ptep_try_set
+ /*
+ * Kernel-fault recovery may have installed the scratch page here, and
+ * some architectures (arm64) prohibit valid->valid PTE transitions.
+ * Install atomically into a none slot. If scratch is present, clear it
+ * and flush_tlb_before_set() (break-before-make) before retrying.
+ */
+ while (!ptep_try_set(pte, pteval)) {
+ pte_t old = ptep_get(pte);
+
+ if (pte_none(old))
+ continue;
+ if (WARN_ON_ONCE(pte_page(old) != d->scratch_page))
+ return -EBUSY;
+ ptep_get_and_clear(&init_mm, addr, pte);
+ flush_tlb_before_set(addr);
+ }
+#else
+ /*
+ * Without ptep_try_set() there is no atomic installer, but such arches
+ * also do not wire up bpf_arena_handle_page_fault(), so no scratch page
+ * is ever installed and the slot is always none here.
+ */
+ if (unlikely(!pte_none(ptep_get(pte))))
+ return -EBUSY;
+ set_pte_at(&init_mm, addr, pte, pteval);
+#endif
d->i++;
return 0;
}
@@ -144,33 +204,59 @@ static void flush_vmap_cache(unsigned long start, unsigned long size)
flush_cache_vmap(start, start + size);
}
-static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages)
+static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
{
+ struct clear_range_data *d = data;
pte_t old_pte;
struct page *page;
- /* sanity check */
- old_pte = ptep_get(pte);
+ /*
+ * Pairs with ptep_try_set() in the kernel-fault scratch installer.
+ * Both sides must be atomic.
+ */
+ old_pte = ptep_get_and_clear(&init_mm, addr, pte);
if (pte_none(old_pte) || !pte_present(old_pte))
- return 0; /* nothing to do */
+ return 0;
page = pte_page(old_pte);
if (WARN_ON_ONCE(!page))
return -EINVAL;
- pte_clear(&init_mm, addr, pte);
+ /*
+ * Skip the per-arena scratch page. A kernel fault on an unallocated uaddr
+ * scratches its PTE. A later bpf_arena_free_pages() over that range walks
+ * here. Without the skip, scratch_page would be freed.
+ */
+ if (page == d->scratch_page)
+ return 0;
+
+ __llist_add(&page->pcp_llist, d->free_pages);
+ return 0;
+}
- /* Add page to the list so it is freed later */
- if (free_pages)
- __llist_add(&page->pcp_llist, free_pages);
+static int apply_range_set_scratch_cb(pte_t *pte, unsigned long addr, void *data)
+{
+ struct page *scratch_page = data;
+ if (!pte_none(ptep_get(pte)))
+ return 0;
+ /*
+ * Best-effort install. ptep_try_set() returns false only if another
+ * installer (real allocation or concurrent fault) won the cmpxchg.
+ * Their PTE is already valid, so the access retry succeeds.
+ *
+ * No flush_tlb_kernel_range() needed. Stale "not mapped" entries just
+ * cause one extra re-fault through this same path.
+ */
+ ptep_try_set(pte, mk_pte(scratch_page, PAGE_KERNEL));
return 0;
}
static int populate_pgtable_except_pte(struct bpf_arena *arena)
{
+ /* Populate intermediates for the recovery range (4 GiB + upper half-guard). */
return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
- KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
+ SZ_4G + GUARD_SZ / 2, apply_range_set_cb, NULL);
}
static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
@@ -221,22 +307,30 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
init_irq_work(&arena->free_irq, arena_free_irq);
INIT_WORK(&arena->free_work, arena_free_worker);
bpf_map_init_from_attr(&arena->map, attr);
+
+ err = bpf_map_alloc_pages(&arena->map, NUMA_NO_NODE, 1, &arena->scratch_page);
+ if (err)
+ goto err_free_arena;
+
range_tree_init(&arena->rt);
err = range_tree_set(&arena->rt, 0, attr->max_entries);
- if (err) {
- bpf_map_area_free(arena);
- goto err;
- }
+ if (err)
+ goto err_free_scratch;
mutex_init(&arena->lock);
+ mutex_init(&arena->zap_mutex);
raw_res_spin_lock_init(&arena->spinlock);
err = populate_pgtable_except_pte(arena);
- if (err) {
- range_tree_destroy(&arena->rt);
- bpf_map_area_free(arena);
- goto err;
- }
+ if (err)
+ goto err_destroy_rt;
return &arena->map;
+
+err_destroy_rt:
+ range_tree_destroy(&arena->rt);
+err_free_scratch:
+ __free_page(arena->scratch_page);
+err_free_arena:
+ bpf_map_area_free(arena);
err:
free_vm_area(kern_vm);
return ERR_PTR(err);
@@ -244,6 +338,7 @@ err:
static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
{
+ struct bpf_arena *arena = data;
struct page *page;
pte_t pte;
@@ -252,6 +347,12 @@ static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
return 0;
page = pte_page(pte);
/*
+ * Skip the scratch page. The walk is page-table-driven, not range-tree-driven,
+ * so it can visit scratch PTEs at uaddrs the BPF program never allocated.
+ */
+ if (page == arena->scratch_page)
+ return 0;
+ /*
* We do not update pte here:
* 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
* 2. TLB flushing is batched or deferred. Even if we clear pte,
@@ -286,9 +387,10 @@ static void arena_map_free(struct bpf_map *map)
* free those pages.
*/
apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
- KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
+ SZ_4G + GUARD_SZ / 2, existing_page_cb, arena);
free_vm_area(arena->kern_vm);
range_tree_destroy(&arena->rt);
+ __free_page(arena->scratch_page);
bpf_map_area_free(arena);
}
@@ -318,6 +420,7 @@ struct vma_list {
struct vm_area_struct *vma;
struct list_head head;
refcount_t mmap_count;
+ u64 zap_gen;
};
static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
@@ -330,6 +433,7 @@ static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
refcount_set(&vml->mmap_count, 1);
vma->vm_private_data = vml;
vml->vma = vma;
+ vml->zap_gen = 0;
list_add(&vml->head, &arena->vma_list);
return 0;
}
@@ -384,33 +488,38 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
return VM_FAULT_RETRY;
page = vmalloc_to_page((void *)kaddr);
- if (page)
+ if (page) {
+ if (page == arena->scratch_page)
+ /* BPF triggered scratch here; don't lazy-alloc over it */
+ goto out_sigsegv;
/* already have a page vmap-ed */
goto out;
+ }
bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
/* User space requested to segfault when page is not allocated by bpf prog */
- goto out_unlock_sigsegv;
+ goto out_sigsegv_memcg;
ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
if (ret)
- goto out_unlock_sigsegv;
+ goto out_sigsegv_memcg;
- struct apply_range_data data = { .pages = &page, .i = 0 };
+ struct apply_range_data data = { .pages = &page, .i = 0,
+ .scratch_page = arena->scratch_page };
/* Account into memcg of the process that created bpf_arena */
ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
if (ret) {
range_tree_set(&arena->rt, vmf->pgoff, 1);
- goto out_unlock_sigsegv;
+ goto out_sigsegv_memcg;
}
ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
if (ret) {
range_tree_set(&arena->rt, vmf->pgoff, 1);
free_pages_nolock(page, 0);
- goto out_unlock_sigsegv;
+ goto out_sigsegv_memcg;
}
flush_vmap_cache(kaddr, PAGE_SIZE);
bpf_map_memcg_exit(old_memcg, new_memcg);
@@ -419,8 +528,9 @@ out:
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
vmf->page = page;
return 0;
-out_unlock_sigsegv:
+out_sigsegv_memcg:
bpf_map_memcg_exit(old_memcg, new_memcg);
+out_sigsegv:
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
return VM_FAULT_SIGSEGV;
}
@@ -511,7 +621,7 @@ static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32
{
struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
- if ((u64)off > arena->user_vm_end - arena->user_vm_start)
+ if ((u64)off >= arena->user_vm_end - arena->user_vm_start)
return -ERANGE;
*imm = (unsigned long)arena->user_vm_start;
return 0;
@@ -587,6 +697,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
return 0;
}
data.pages = pages;
+ data.scratch_page = arena->scratch_page;
if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
goto out_free_pages;
@@ -668,12 +779,60 @@ out_free_pages:
*/
static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
{
+ unsigned long size = (unsigned long)page_cnt << PAGE_SHIFT;
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
struct vma_list *vml;
+ unsigned long vm_start;
+ u64 my_gen;
- guard(mutex)(&arena->lock);
- /* iterate link list under lock */
- list_for_each_entry(vml, &arena->vma_list, head)
- zap_vma_range(vml->vma, uaddr, PAGE_SIZE * page_cnt);
+ /*
+ * Taking mmap_read_lock() under arena->lock would deadlock against
+ * arena_vm_close(), which runs with mmap_write_lock held and then
+ * acquires arena->lock. Drop arena->lock for mmap_read_lock().
+ *
+ * Use per-call my_gen, recorded in vml->zap_gen, to remember which
+ * vmls this invocation has already processed across the lock drop.
+ * Hold zap_mutex around the whole walk so concurrent zap_pages()
+ * callers cannot overwrite each other's marks on shared vmls --
+ * otherwise call B's mark would make call A skip a vml that A has
+ * not yet zapped for A's uaddr range.
+ */
+ mutex_lock(&arena->zap_mutex);
+ mutex_lock(&arena->lock);
+ my_gen = ++arena->zap_gen;
+ for (;;) {
+ mm = NULL;
+ list_for_each_entry(vml, &arena->vma_list, head) {
+ if (vml->zap_gen >= my_gen)
+ continue;
+ vml->zap_gen = my_gen;
+ if (!mmget_not_zero(vml->vma->vm_mm))
+ continue;
+ mm = vml->vma->vm_mm;
+ vm_start = vml->vma->vm_start;
+ break;
+ }
+ if (!mm)
+ break;
+ mutex_unlock(&arena->lock);
+
+ mmap_read_lock(mm);
+ /*
+ * Re-resolve: while we waited the VMA could have been unmapped
+ * and a different mapping installed at the same address.
+ */
+ vma = find_vma(mm, vm_start);
+ if (vma && vma->vm_start == vm_start &&
+ vma->vm_file && vma->vm_file->private_data == &arena->map)
+ zap_vma_range(vma, uaddr, size);
+ mmap_read_unlock(mm);
+ mmput(mm);
+
+ mutex_lock(&arena->lock);
+ }
+ mutex_unlock(&arena->lock);
+ mutex_unlock(&arena->zap_mutex);
}
static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable)
@@ -685,6 +844,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
struct llist_head free_pages;
struct llist_node *pos, *t;
struct arena_free_span *s;
+ struct clear_range_data cdata;
unsigned long flags;
int ret = 0;
@@ -713,9 +873,11 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
range_tree_set(&arena->rt, pgoff, page_cnt);
init_llist_head(&free_pages);
+ cdata.free_pages = &free_pages;
+ cdata.scratch_page = arena->scratch_page;
/* clear ptes and collect struct pages */
apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
- apply_range_clear_cb, &free_pages);
+ apply_range_clear_cb, &cdata);
/* drop the lock to do the tlb flush and zap pages */
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
@@ -805,6 +967,7 @@ static void arena_free_worker(struct work_struct *work)
struct arena_free_span *s;
u64 arena_vm_start, user_vm_start;
struct llist_head free_pages;
+ struct clear_range_data cdata;
struct page *page;
unsigned long full_uaddr;
long kaddr, page_cnt, pgoff;
@@ -818,6 +981,8 @@ static void arena_free_worker(struct work_struct *work)
bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
init_llist_head(&free_pages);
+ cdata.free_pages = &free_pages;
+ cdata.scratch_page = arena->scratch_page;
arena_vm_start = bpf_arena_get_kern_vm_start(arena);
user_vm_start = bpf_arena_get_user_vm_start(arena);
@@ -830,7 +995,7 @@ static void arena_free_worker(struct work_struct *work)
/* clear ptes and collect pages in free_pages llist */
apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
- apply_range_clear_cb, &free_pages);
+ apply_range_clear_cb, &cdata);
range_tree_set(&arena->rt, pgoff, page_cnt);
}
@@ -893,6 +1058,19 @@ void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 pag
return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false);
}
+
+/*
+ * Sleepable entry point for arena page allocation.
+ *
+ * Validates the arguments and forwards to arena_alloc_pages() with the
+ * last argument set to true; bpf_arena_alloc_pages_non_sleepable() makes
+ * the same call with false (presumably the "sleepable" flag - confirm
+ * against arena_alloc_pages()).
+ *
+ * Returns NULL without calling arena_alloc_pages() when @p__map is not a
+ * BPF_MAP_TYPE_ARENA map, when @flags is non-zero, or when @page_cnt is
+ * zero. Otherwise returns arena_alloc_pages()'s result cast to a pointer.
+ */
+void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
+ int node_id, u64 flags)
+{
+ struct bpf_map *map = p__map;
+ /* Pointer arithmetic only; arena is not used until the checks below pass. */
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
+ return NULL;
+
+ return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
+}
+
__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
{
struct bpf_map *map = p__map;
@@ -945,23 +1123,12 @@ static int __init kfunc_init(void)
}
late_initcall(kfunc_init);
-void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+static void __bpf_prog_report_arena_violation(struct bpf_prog *prog, bool write,
+ unsigned long addr, unsigned long fault_ip)
{
struct bpf_stream_stage ss;
- struct bpf_prog *prog;
u64 user_vm_start;
- /*
- * The RCU read lock is held to safely traverse the latch tree, but we
- * don't need its protection when accessing the prog, since it will not
- * disappear while we are handling the fault.
- */
- rcu_read_lock();
- prog = bpf_prog_ksym_find(fault_ip);
- rcu_read_unlock();
- if (!prog)
- return;
-
/* Use main prog for stream access */
prog = prog->aux->main_prog_aux->prog;
@@ -974,3 +1141,53 @@ void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned lo
bpf_stream_dump_stack(ss);
}));
}
+
+/*
+ * Try to recover from a fault at @addr that may belong to a BPF arena.
+ *
+ * The BPF program is located with bpf_prog_find_from_stack(). If that
+ * program has an arena and the page containing @addr lies in
+ * [kbase, kbase + SZ_4G + GUARD_SZ / 2), where kbase is the arena's kernel
+ * VM start, apply_range_set_scratch_cb is run over that single page with
+ * arena->scratch_page (presumably installing the scratch page there -
+ * confirm against the callback), the range is flushed and the violation
+ * is reported against the program.
+ *
+ * @addr:     faulting address; only its page (addr & PAGE_MASK) is used
+ * @is_write: forwarded to the violation report
+ * @fault_ip: forwarded to the violation report
+ *
+ * Returns true when the fault was claimed by this handler, false when no
+ * program was found, the program has no arena, or the page is outside the
+ * recoverable range.
+ */
+bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip)
+{
+ struct bpf_arena *arena;
+ struct bpf_prog *prog;
+ unsigned long kbase;
+ unsigned long page_addr = addr & PAGE_MASK;
+
+ prog = bpf_prog_find_from_stack();
+ if (!prog)
+ return false;
+
+ arena = prog->aux->arena;
+ /* a prog not using arena may be on stack, so arena can be NULL */
+ if (!arena)
+ return false;
+
+ kbase = bpf_arena_get_kern_vm_start(arena);
+
+ /*
+ * Recovery covers the 4 GiB mappable band plus the upper half-guard.
+ * Lower guard is unreachable from kfuncs; an address there indicates
+ * a different bug class - leave it to the regular kernel oops path.
+ */
+ if (page_addr < kbase || page_addr >= kbase + SZ_4G + GUARD_SZ / 2)
+ return false;
+
+ /*
+ * NOTE(review): the return value of apply_to_page_range() is discarded,
+ * so true is returned even if the callback could not be applied -
+ * confirm that the caller tolerates a repeated fault in that case.
+ */
+ apply_to_page_range(&init_mm, page_addr, PAGE_SIZE,
+ apply_range_set_scratch_cb, arena->scratch_page);
+ flush_vmap_cache(page_addr, PAGE_SIZE);
+ /* The report carries the page-aligned offset from kbase, not the raw @addr. */
+ __bpf_prog_report_arena_violation(prog, is_write, page_addr - kbase, fault_ip);
+ return true;
+}
+
+/*
+ * Report an arena access violation for the BPF program that contains
+ * @fault_ip.
+ *
+ * The program is resolved with bpf_prog_ksym_find(); if @fault_ip does not
+ * map to a BPF program nothing is reported. Otherwise @write, @addr and
+ * @fault_ip are passed unchanged to __bpf_prog_report_arena_violation().
+ */
+void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+{
+ struct bpf_prog *prog;
+
+ /*
+ * The RCU read lock is held to safely traverse the latch tree, but we
+ * don't need its protection when accessing the prog, since it will not
+ * disappear while we are handling the fault.
+ */
+ rcu_read_lock();
+ prog = bpf_prog_ksym_find(fault_ip);
+ rcu_read_unlock();
+ if (!prog)
+ return;
+ __bpf_prog_report_arena_violation(prog, write, addr, fault_ip);
+}
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 5e25e0353509..248b4818178c 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -175,14 +175,12 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
return array->value + (u64)array->elem_size * (index & array->index_mask);
}
-static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size,
- void *hash_buf)
+static int array_map_get_hash(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
sha256(array->value, (u64)array->elem_size * array->map.max_entries,
- hash_buf);
- memcpy(array->map.sha, hash_buf, sizeof(array->map.sha));
+ array->map.sha);
return 0;
}
@@ -386,7 +384,7 @@ static long array_map_update_elem(struct bpf_map *map, void *key, void *value,
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
val = this_cpu_ptr(array->pptrs[index & array->index_mask]);
copy_map_value(map, val, value);
- bpf_obj_free_fields(array->map.record, val);
+ bpf_obj_cancel_fields(map, val);
} else {
val = array->value +
(u64)array->elem_size * (index & array->index_mask);
@@ -394,7 +392,7 @@ static long array_map_update_elem(struct bpf_map *map, void *key, void *value,
copy_map_value_locked(map, val, value, false);
else
copy_map_value(map, val, value);
- bpf_obj_free_fields(array->map.record, val);
+ bpf_obj_cancel_fields(map, val);
}
return 0;
}
@@ -434,14 +432,14 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
cpu = map_flags >> 32;
ptr = per_cpu_ptr(pptr, cpu);
copy_map_value(map, ptr, value);
- bpf_obj_free_fields(array->map.record, ptr);
+ bpf_obj_cancel_fields(map, ptr);
goto unlock;
}
for_each_possible_cpu(cpu) {
ptr = per_cpu_ptr(pptr, cpu);
val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu;
copy_map_value(map, ptr, val);
- bpf_obj_free_fields(array->map.record, ptr);
+ bpf_obj_cancel_fields(map, ptr);
}
unlock:
rcu_read_unlock();
@@ -827,7 +825,7 @@ const struct bpf_map_ops array_map_ops = {
};
const struct bpf_map_ops percpu_array_map_ops = {
- .map_meta_equal = bpf_map_meta_equal,
+ .map_meta_equal = array_map_meta_equal,
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
diff --git a/kernel/bpf/backtrack.c b/kernel/bpf/backtrack.c
index 854731dc93fe..2e4ae0ef0860 100644
--- a/kernel/bpf/backtrack.c
+++ b/kernel/bpf/backtrack.c
@@ -9,7 +9,7 @@
/* for any branch, call, exit record the history of jmps in the given state */
int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
- int insn_flags, u64 linked_regs)
+ int insn_flags, int spi, int frame, u64 linked_regs)
{
u32 cnt = cur->jmp_history_cnt;
struct bpf_jmp_history_entry *p;
@@ -25,6 +25,8 @@ int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state
env, "insn history: insn_idx %d cur flags %x new flags %x",
env->insn_idx, env->cur_hist_ent->flags, insn_flags);
env->cur_hist_ent->flags |= insn_flags;
+ env->cur_hist_ent->spi = spi;
+ env->cur_hist_ent->frame = frame;
verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env,
"insn history: insn_idx %d linked_regs: %#llx",
env->insn_idx, env->cur_hist_ent->linked_regs);
@@ -43,6 +45,8 @@ int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state
p->idx = env->insn_idx;
p->prev_idx = env->prev_insn_idx;
p->flags = insn_flags;
+ p->spi = spi;
+ p->frame = frame;
p->linked_regs = linked_regs;
cur->jmp_history_cnt = cnt;
env->cur_hist_ent = p;
@@ -64,16 +68,6 @@ static bool is_atomic_fetch_insn(const struct bpf_insn *insn)
(insn->imm & BPF_FETCH);
}
-static int insn_stack_access_spi(int insn_flags)
-{
- return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK;
-}
-
-static int insn_stack_access_frameno(int insn_flags)
-{
- return insn_flags & INSN_F_FRAMENO_MASK;
-}
-
/* Backtrack one insn at a time. If idx is not at the top of recorded
* history then previous instruction came from straight line execution.
* Return -ENOENT if we exhausted all instructions within given state.
@@ -135,11 +129,21 @@ static inline u32 bt_empty(struct backtrack_state *bt)
int i;
for (i = 0; i <= bt->frame; i++)
- mask |= bt->reg_masks[i] | bt->stack_masks[i];
+ mask |= bt->reg_masks[i] | bt->stack_masks[i] | bt->stack_arg_masks[i];
return mask == 0;
}
+/*
+ * Clear bit @slot in the stack-argument mask of call frame @frame.
+ * NOTE(review): @slot is used as a shift count on an int constant and
+ * @frame as an array index; neither is range-checked here - callers are
+ * presumably expected to pass valid values.
+ */
+static inline void bt_clear_frame_stack_arg_slot(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+ bt->stack_arg_masks[frame] &= ~(1 << slot);
+}
+
+/*
+ * Return true if bit @slot is set in the stack-argument mask of call
+ * frame @frame, false otherwise. The masked value is converted to bool
+ * by the return type.
+ */
+static inline bool bt_is_frame_stack_arg_slot_set(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+ return bt->stack_arg_masks[frame] & (1 << slot);
+}
+
static inline int bt_subprog_enter(struct backtrack_state *bt)
{
if (bt->frame == MAX_CALL_FRAMES - 1) {
@@ -200,6 +204,11 @@ static inline u64 bt_stack_mask(struct backtrack_state *bt)
return bt->stack_masks[bt->frame];
}
+/*
+ * Return the stack-argument mask of the frame currently selected in @bt
+ * (bt->frame), as a u8 bitmask.
+ */
+static inline u8 bt_stack_arg_mask(struct backtrack_state *bt)
+{
+ return bt->stack_arg_masks[bt->frame];
+}
+
static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg)
{
return bt->reg_masks[bt->frame] & (1 << reg);
@@ -341,6 +350,19 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
return 0;
bt_clear_reg(bt, load_reg);
+ if (hist && hist->flags & INSN_F_STACK_ARG_ACCESS) {
+ spi = hist->spi;
+ /*
+ * Stack arg read: callee reads from r11+off, but
+ * the data lives in the caller's stack_arg_regs.
+ * Set the mask in the caller frame so precision
+ * is marked in the caller's slot at the callee
+ * entry checkpoint.
+ */
+ bt_set_frame_stack_arg_slot(bt, bt->frame - 1, spi);
+ return 0;
+ }
+
/* scalars can only be spilled into stack w/o losing precision.
* Load from any other memory can be zero extended.
* The desire to keep that precision is already indicated
@@ -353,8 +375,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* that [fp - off] slot contains scalar that needs to be
* tracked with precision
*/
- spi = insn_stack_access_spi(hist->flags);
- fr = insn_stack_access_frameno(hist->flags);
+ spi = hist->spi;
+ fr = hist->frame;
bpf_bt_set_frame_slot(bt, fr, spi);
} else if (class == BPF_STX || class == BPF_ST) {
if (bt_is_reg_set(bt, dreg))
@@ -363,11 +385,22 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* encountered a case of pointer subtraction.
*/
return -ENOTSUPP;
+
+ if (hist && hist->flags & INSN_F_STACK_ARG_ACCESS) {
+ spi = hist->spi;
+ if (!bt_is_frame_stack_arg_slot_set(bt, bt->frame, spi))
+ return 0;
+ bt_clear_frame_stack_arg_slot(bt, bt->frame, spi);
+ if (class == BPF_STX)
+ bt_set_reg(bt, sreg);
+ return 0;
+ }
+
/* scalars can only be spilled into stack */
if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
return 0;
- spi = insn_stack_access_spi(hist->flags);
- fr = insn_stack_access_frameno(hist->flags);
+ spi = hist->spi;
+ fr = hist->frame;
if (!bt_is_frame_slot_set(bt, fr, spi))
return 0;
bt_clear_frame_slot(bt, fr, spi);
@@ -431,6 +464,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
bpf_bt_set_frame_reg(bt, bt->frame - 1, i);
}
}
+ if (bt_stack_arg_mask(bt)) {
+ verifier_bug(env,
+ "static subprog leftover stack arg slots %x",
+ bt_stack_arg_mask(bt));
+ return -EFAULT;
+ }
if (bt_subprog_exit(bt))
return -EFAULT;
return 0;
@@ -901,6 +940,17 @@ int bpf_mark_chain_precision(struct bpf_verifier_env *env,
*changed = true;
}
}
+ for (i = 0; i < func->out_stack_arg_cnt; i++) {
+ if (!bt_is_frame_stack_arg_slot_set(bt, fr, i))
+ continue;
+ reg = &func->stack_arg_regs[i];
+ if (reg->type != SCALAR_VALUE || reg->precise) {
+ bt_clear_frame_stack_arg_slot(bt, fr, i);
+ } else {
+ reg->precise = true;
+ *changed = true;
+ }
+ }
if (env->log.level & BPF_LOG_LEVEL2) {
fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
bt_frame_reg_mask(bt, fr));
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index e7a2fc60523f..5ed7cb4b98c0 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -13,23 +13,8 @@
#define PERCPU_FREE_TARGET (4)
#define PERCPU_NR_SCANS PERCPU_FREE_TARGET
-/* Helpers to get the local list index */
-#define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET)
-#define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE)
-#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING)
#define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET)
-/* Local list helpers */
-static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l)
-{
- return &loc_l->lists[LOCAL_FREE_LIST_IDX];
-}
-
-static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l)
-{
- return &loc_l->lists[LOCAL_PENDING_LIST_IDX];
-}
-
/* bpf_lru_node helpers */
static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node)
{
@@ -72,6 +57,7 @@ static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l,
bpf_lru_list_count_dec(l, node->type);
node->type = tgt_free_type;
+ WRITE_ONCE(node->pending_free, 0);
list_move(&node->list, free_list);
}
@@ -87,6 +73,9 @@ static void __bpf_lru_node_move_in(struct bpf_lru_list *l,
bpf_lru_list_count_inc(l, tgt_type);
node->type = tgt_type;
bpf_lru_node_clear_ref(node);
+ /* Reset pending_free only when moving to the free list */
+ if (tgt_type == BPF_LRU_LIST_T_FREE)
+ WRITE_ONCE(node->pending_free, 0);
list_move(&node->list, &l->lists[tgt_type]);
}
@@ -212,9 +201,11 @@ __bpf_lru_list_shrink_inactive(struct bpf_lru *lru,
unsigned int i = 0;
list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) {
- if (bpf_lru_node_is_ref(node)) {
+ if (bpf_lru_node_is_ref(node) &&
+ !READ_ONCE(node->pending_free)) {
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
- } else if (lru->del_from_htab(lru->del_arg, node)) {
+ } else if (READ_ONCE(node->pending_free) ||
+ lru->del_from_htab(lru->del_arg, node)) {
__bpf_lru_node_move_to_free(l, node, free_list,
tgt_free_type);
if (++nshrinked == tgt_nshrink)
@@ -273,7 +264,8 @@ static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru,
list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list,
list) {
- if (lru->del_from_htab(lru->del_arg, node)) {
+ if (READ_ONCE(node->pending_free) ||
+ lru->del_from_htab(lru->del_arg, node)) {
__bpf_lru_node_move_to_free(l, node, free_list,
tgt_free_type);
return 1;
@@ -290,8 +282,10 @@ static void __local_list_flush(struct bpf_lru_list *l,
struct bpf_lru_node *node, *tmp_node;
list_for_each_entry_safe_reverse(node, tmp_node,
- local_pending_list(loc_l), list) {
- if (bpf_lru_node_is_ref(node))
+ &loc_l->pending_list, list) {
+ if (READ_ONCE(node->pending_free))
+ __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_FREE);
+ else if (bpf_lru_node_is_ref(node))
__bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE);
else
__bpf_lru_node_move_in(l, node,
@@ -307,9 +301,12 @@ static void bpf_lru_list_push_free(struct bpf_lru_list *l,
if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
return;
- raw_spin_lock_irqsave(&l->lock, flags);
+ if (raw_res_spin_lock_irqsave(&l->lock, flags)) {
+ WRITE_ONCE(node->pending_free, 1);
+ return;
+ }
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
- raw_spin_unlock_irqrestore(&l->lock, flags);
+ raw_res_spin_unlock_irqrestore(&l->lock, flags);
}
static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
@@ -318,8 +315,10 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
struct bpf_lru_list *l = &lru->common_lru.lru_list;
struct bpf_lru_node *node, *tmp_node;
unsigned int nfree = 0;
+ LIST_HEAD(tmp_free);
- raw_spin_lock(&l->lock);
+ if (raw_res_spin_lock(&l->lock))
+ return;
__local_list_flush(l, loc_l);
@@ -327,7 +326,7 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE],
list) {
- __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l),
+ __bpf_lru_node_move_to_free(l, node, &tmp_free,
BPF_LRU_LOCAL_LIST_T_FREE);
if (++nfree == lru->target_free)
break;
@@ -335,10 +334,19 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
if (nfree < lru->target_free)
__bpf_lru_list_shrink(lru, l, lru->target_free - nfree,
- local_free_list(loc_l),
+ &tmp_free,
BPF_LRU_LOCAL_LIST_T_FREE);
- raw_spin_unlock(&l->lock);
+ raw_res_spin_unlock(&l->lock);
+
+ /*
+ * Transfer the harvested nodes from the temporary list_head into
+ * the lockless per-CPU free llist.
+ */
+ list_for_each_entry_safe(node, tmp_node, &tmp_free, list) {
+ list_del(&node->list);
+ llist_add(&node->llist, &loc_l->free_llist);
+ }
}
static void __local_list_add_pending(struct bpf_lru *lru,
@@ -350,22 +358,21 @@ static void __local_list_add_pending(struct bpf_lru *lru,
*(u32 *)((void *)node + lru->hash_offset) = hash;
node->cpu = cpu;
node->type = BPF_LRU_LOCAL_LIST_T_PENDING;
+ WRITE_ONCE(node->pending_free, 0);
bpf_lru_node_clear_ref(node);
- list_add(&node->list, local_pending_list(loc_l));
+ list_add(&node->list, &loc_l->pending_list);
}
static struct bpf_lru_node *
__local_list_pop_free(struct bpf_lru_locallist *loc_l)
{
- struct bpf_lru_node *node;
+ struct llist_node *llnode;
- node = list_first_entry_or_null(local_free_list(loc_l),
- struct bpf_lru_node,
- list);
- if (node)
- list_del(&node->list);
+ llnode = llist_del_first(&loc_l->free_llist);
+ if (!llnode)
+ return NULL;
- return node;
+ return container_of(llnode, struct bpf_lru_node, llist);
}
static struct bpf_lru_node *
@@ -376,10 +383,10 @@ __local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l)
ignore_ref:
/* Get from the tail (i.e. older element) of the pending list. */
- list_for_each_entry_reverse(node, local_pending_list(loc_l),
- list) {
+ list_for_each_entry_reverse(node, &loc_l->pending_list, list) {
if ((!bpf_lru_node_is_ref(node) || force) &&
- lru->del_from_htab(lru->del_arg, node)) {
+ (READ_ONCE(node->pending_free) ||
+ lru->del_from_htab(lru->del_arg, node))) {
list_del(&node->list);
return node;
}
@@ -404,7 +411,8 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
l = per_cpu_ptr(lru->percpu_lru, cpu);
- raw_spin_lock_irqsave(&l->lock, flags);
+ if (raw_res_spin_lock_irqsave(&l->lock, flags))
+ return NULL;
__bpf_lru_list_rotate(lru, l);
@@ -420,7 +428,7 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
}
- raw_spin_unlock_irqrestore(&l->lock, flags);
+ raw_res_spin_unlock_irqrestore(&l->lock, flags);
return node;
}
@@ -437,7 +445,8 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
loc_l = per_cpu_ptr(clru->local_list, cpu);
- raw_spin_lock_irqsave(&loc_l->lock, flags);
+ if (raw_res_spin_lock_irqsave(&loc_l->lock, flags))
+ return NULL;
node = __local_list_pop_free(loc_l);
if (!node) {
@@ -448,17 +457,22 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
if (node)
__local_list_add_pending(lru, loc_l, cpu, node, hash);
- raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+ raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
if (node)
return node;
- /* No free nodes found from the local free list and
+ /*
+ * No free nodes found from the local free list and
* the global LRU list.
*
* Steal from the local free/pending list of the
* current CPU and remote CPU in RR. It starts
* with the loc_l->next_steal CPU.
+ *
+ * Acquire the victim's lock before touching either list. On
+ * acquisition failure (rqspinlock AA or timeout) skip the victim
+ * and try the next CPU.
*/
first_steal = loc_l->next_steal;
@@ -466,24 +480,36 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
do {
steal_loc_l = per_cpu_ptr(clru->local_list, steal);
- raw_spin_lock_irqsave(&steal_loc_l->lock, flags);
-
- node = __local_list_pop_free(steal_loc_l);
- if (!node)
- node = __local_list_pop_pending(lru, steal_loc_l);
-
- raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
+ if (!raw_res_spin_lock_irqsave(&steal_loc_l->lock, flags)) {
+ node = __local_list_pop_free(steal_loc_l);
+ if (!node)
+ node = __local_list_pop_pending(lru, steal_loc_l);
+ raw_res_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
+ }
steal = cpumask_next_wrap(steal, cpu_possible_mask);
} while (!node && steal != first_steal);
loc_l->next_steal = steal;
- if (node) {
- raw_spin_lock_irqsave(&loc_l->lock, flags);
- __local_list_add_pending(lru, loc_l, cpu, node, hash);
- raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+ if (!node)
+ return NULL;
+
+ if (raw_res_spin_lock_irqsave(&loc_l->lock, flags)) {
+ /*
+ * The local pending lock can't be acquired (rqspinlock AA
+ * or timeout). Return the stolen node to the per-CPU
+ * free_llist instead of orphaning it; the next pop_free on
+ * this CPU will pick it up.
+ */
+ node->type = BPF_LRU_LOCAL_LIST_T_FREE;
+ bpf_lru_node_clear_ref(node);
+ WRITE_ONCE(node->pending_free, 0);
+ llist_add(&node->llist, &loc_l->free_llist);
+ return NULL;
}
+ __local_list_add_pending(lru, loc_l, cpu, node, hash);
+ raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
return node;
}
@@ -511,18 +537,24 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru,
loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu);
- raw_spin_lock_irqsave(&loc_l->lock, flags);
+ if (raw_res_spin_lock_irqsave(&loc_l->lock, flags)) {
+ WRITE_ONCE(node->pending_free, 1);
+ return;
+ }
if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) {
- raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+ raw_res_spin_unlock_irqrestore(&loc_l->lock,
+ flags);
goto check_lru_list;
}
node->type = BPF_LRU_LOCAL_LIST_T_FREE;
bpf_lru_node_clear_ref(node);
- list_move(&node->list, local_free_list(loc_l));
+ list_del(&node->list);
+
+ raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
- raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+ llist_add(&node->llist, &loc_l->free_llist);
return;
}
@@ -538,11 +570,14 @@ static void bpf_percpu_lru_push_free(struct bpf_lru *lru,
l = per_cpu_ptr(lru->percpu_lru, node->cpu);
- raw_spin_lock_irqsave(&l->lock, flags);
+ if (raw_res_spin_lock_irqsave(&l->lock, flags)) {
+ WRITE_ONCE(node->pending_free, 1);
+ return;
+ }
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
- raw_spin_unlock_irqrestore(&l->lock, flags);
+ raw_res_spin_unlock_irqrestore(&l->lock, flags);
}
void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
@@ -565,6 +600,7 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
node = (struct bpf_lru_node *)(buf + node_offset);
node->type = BPF_LRU_LIST_T_FREE;
+ node->pending_free = 0;
bpf_lru_node_clear_ref(node);
list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
buf += elem_size;
@@ -594,6 +630,7 @@ again:
node = (struct bpf_lru_node *)(buf + node_offset);
node->cpu = cpu;
node->type = BPF_LRU_LIST_T_FREE;
+ node->pending_free = 0;
bpf_lru_node_clear_ref(node);
list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
i++;
@@ -618,14 +655,12 @@ void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu)
{
- int i;
-
- for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++)
- INIT_LIST_HEAD(&loc_l->lists[i]);
+ INIT_LIST_HEAD(&loc_l->pending_list);
+ init_llist_head(&loc_l->free_llist);
loc_l->next_steal = cpu;
- raw_spin_lock_init(&loc_l->lock);
+ raw_res_spin_lock_init(&loc_l->lock);
}
static void bpf_lru_list_init(struct bpf_lru_list *l)
@@ -640,7 +675,7 @@ static void bpf_lru_list_init(struct bpf_lru_list *l)
l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE];
- raw_spin_lock_init(&l->lock);
+ raw_res_spin_lock_init(&l->lock);
}
int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index fe2661a58ea9..8d0ee61622af 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -6,11 +6,11 @@
#include <linux/cache.h>
#include <linux/list.h>
-#include <linux/spinlock_types.h>
+#include <linux/llist.h>
+#include <asm/rqspinlock.h>
#define NR_BPF_LRU_LIST_T (3)
#define NR_BPF_LRU_LIST_COUNT (2)
-#define NR_BPF_LRU_LOCAL_LIST_T (2)
#define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T
enum bpf_lru_list_type {
@@ -22,10 +22,22 @@ enum bpf_lru_list_type {
};
struct bpf_lru_node {
- struct list_head list;
+ /*
+ * A node is in at most one list at a time. The free path on the
+ * per-CPU locallist uses an llist, so share storage via a union.
+ */
+ union {
+ struct list_head list;
+ struct llist_node llist;
+ };
u16 cpu;
u8 type;
u8 ref;
+ /*
+ * Marks nodes whose *_push_free() lock acquire failed; reclaimed
+ * by flush/shrink which honor the flag instead of del_from_htab().
+ */
+ u8 pending_free;
};
struct bpf_lru_list {
@@ -34,13 +46,14 @@ struct bpf_lru_list {
/* The next inactive list rotation starts from here */
struct list_head *next_inactive_rotation;
- raw_spinlock_t lock ____cacheline_aligned_in_smp;
+ rqspinlock_t lock ____cacheline_aligned_in_smp;
};
struct bpf_lru_locallist {
- struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T];
+ struct list_head pending_list;
+ struct llist_head free_llist;
u16 next_steal;
- raw_spinlock_t lock;
+ rqspinlock_t lock;
};
struct bpf_common_lru {
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index c5c925f00202..564071a92d7d 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -427,6 +427,26 @@ BTF_ID(func, bpf_lsm_audit_rule_known)
BTF_ID(func, bpf_lsm_inode_xattr_skipcap)
BTF_SET_END(bool_lsm_hooks)
+/* hooks returning void */
+#define LSM_HOOK_void(DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME)
+#define LSM_HOOK_int(DEFAULT, NAME, ...) /* nothing */
+#define LSM_HOOK(RET, DEFAULT, NAME, ...) LSM_HOOK_##RET(DEFAULT, NAME, __VA_ARGS__)
+BTF_SET_START(void_lsm_hooks)
+#include <linux/lsm_hook_defs.h>
+#undef LSM_HOOK
+#undef LSM_HOOK_void
+#undef LSM_HOOK_int
+BTF_SET_END(void_lsm_hooks)
+
+/*
+ * Tell whether the LSM hook identified by @btf_id returns an errno.
+ *
+ * Returns false when @btf_id is a member of bool_lsm_hooks or of
+ * void_lsm_hooks; every other id is treated as errno-returning and
+ * yields true. The id is not otherwise checked to be an LSM hook.
+ */
+bool bpf_lsm_hook_returns_errno(u32 btf_id)
+{
+ if (btf_id_set_contains(&bool_lsm_hooks, btf_id))
+ return false;
+ if (btf_id_set_contains(&void_lsm_hooks, btf_id))
+ return false;
+ return true;
+}
+
int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
struct bpf_retval_range *retval_range)
{
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 521cb9d7e8c7..51b16e5f5534 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -594,8 +594,8 @@ const struct bpf_link_ops bpf_struct_ops_link_lops = {
.dealloc = bpf_struct_ops_link_dealloc,
};
-int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
- struct bpf_tramp_link *link,
+int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_nodes *tnodes,
+ struct bpf_tramp_node *node,
const struct btf_func_model *model,
void *stub_func,
void **_image, u32 *_image_off,
@@ -605,13 +605,13 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
void *image = *_image;
int size;
- tlinks[BPF_TRAMP_FENTRY].links[0] = link;
- tlinks[BPF_TRAMP_FENTRY].nr_links = 1;
+ tnodes[BPF_TRAMP_FENTRY].nodes[0] = node;
+ tnodes[BPF_TRAMP_FENTRY].nr_nodes = 1;
if (model->ret_size > 0)
flags |= BPF_TRAMP_F_RET_FENTRY_RET;
- size = arch_bpf_trampoline_size(model, flags, tlinks, stub_func);
+ size = arch_bpf_trampoline_size(model, flags, tnodes, stub_func);
if (size <= 0)
return size ? : -EFAULT;
@@ -628,7 +628,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
size = arch_prepare_bpf_trampoline(NULL, image + image_off,
image + image_off + size,
- model, flags, tlinks, stub_func);
+ model, flags, tnodes, stub_func);
if (size <= 0) {
if (image != *_image)
bpf_struct_ops_image_free(image);
@@ -693,7 +693,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
const struct btf_type *module_type;
const struct btf_member *member;
const struct btf_type *t = st_ops_desc->type;
- struct bpf_tramp_links *tlinks;
+ struct bpf_tramp_nodes *tnodes;
void *udata, *kdata;
int prog_fd, err;
u32 i, trampoline_start, image_off = 0;
@@ -720,8 +720,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
if (uvalue->common.state || refcount_read(&uvalue->common.refcnt))
return -EINVAL;
- tlinks = kzalloc_objs(*tlinks, BPF_TRAMP_MAX);
- if (!tlinks)
+ tnodes = kzalloc_objs(*tnodes, BPF_TRAMP_MAX);
+ if (!tnodes)
return -ENOMEM;
uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;
@@ -817,8 +817,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
err = -ENOMEM;
goto reset_unlock;
}
- bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
- &bpf_struct_ops_link_lops, prog, prog->expected_attach_type);
+ bpf_tramp_link_init(link, BPF_LINK_TYPE_STRUCT_OPS,
+ &bpf_struct_ops_link_lops, prog, prog->expected_attach_type, 0);
+
*plink++ = &link->link;
/* Poison pointer on error instead of return for backward compatibility */
@@ -832,7 +833,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
*pksym++ = ksym;
trampoline_start = image_off;
- err = bpf_struct_ops_prepare_trampoline(tlinks, link,
+ err = bpf_struct_ops_prepare_trampoline(tnodes, &link->node,
&st_ops->func_models[i],
*(void **)(st_ops->cfi_stubs + moff),
&image, &image_off,
@@ -911,7 +912,7 @@ reset_unlock:
memset(uvalue, 0, map->value_size);
memset(kvalue, 0, map->value_size);
unlock:
- kfree(tlinks);
+ kfree(tnodes);
mutex_unlock(&st_map->lock);
if (!err)
bpf_struct_ops_map_add_ksyms(st_map);
@@ -1204,6 +1205,42 @@ u32 bpf_struct_ops_id(const void *kdata)
}
EXPORT_SYMBOL_GPL(bpf_struct_ops_id);
+/**
+ * bpf_struct_ops_for_each_prog - Invoke @cb for each member prog
+ * @kdata: kernel-side struct_ops vmtable (the @kdata arg to ->reg/->update/->unreg)
+ * @cb: callback invoked once per member prog; non-zero return stops iteration
+ * @data: opaque argument passed to @cb
+ *
+ * Walks the struct_ops member progs registered on the map containing @kdata.
+ * Intended for use from struct_ops ->reg() callbacks (and similar) that need to
+ * inspect the loaded BPF programs (for example to discover maps they reference
+ * via @prog->aux->used_maps).
+ *
+ * Return: 0 if iteration completed, otherwise the first non-zero @cb return.
+ */
+int bpf_struct_ops_for_each_prog(const void *kdata,
+ int (*cb)(struct bpf_prog *prog, void *data),
+ void *data)
+{
+ struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map;
+ u32 i;
+ int ret;
+
+ /*
+ * @kdata is the data member of a bpf_struct_ops_value, which in turn is
+ * the kvalue member of the owning bpf_struct_ops_map; walk back out.
+ */
+ kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
+ /* NOTE(review): no lock is taken here - presumably the caller keeps links[] stable. */
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ /* Slots without a link are skipped rather than passed to @cb. */
+ if (!st_map->links[i])
+ continue;
+ ret = cb(st_map->links[i]->prog, data);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bpf_struct_ops_for_each_prog);
+
static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index a62d78581207..15ae7c43f594 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -182,7 +182,6 @@
#define BITS_ROUNDUP_BYTES(bits) \
(BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
-#define BTF_INFO_MASK 0x9f00ffff
#define BTF_INT_MASK 0x0fffffff
#define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE)
#define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET)
@@ -289,7 +288,7 @@ enum verifier_phase {
struct resolve_vertex {
const struct btf_type *t;
u32 type_id;
- u16 next_member;
+ u32 next_member;
};
enum visit_state {
@@ -2031,7 +2030,7 @@ static int env_stack_push(struct btf_verifier_env *env,
}
static void env_stack_set_next_member(struct btf_verifier_env *env,
- u16 next_member)
+ u32 next_member)
{
env->stack[env->top_stack - 1].next_member = next_member;
}
@@ -3293,7 +3292,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
struct btf *btf = env->btf;
u32 struct_size = t->size;
u32 offset;
- u16 i;
+ u32 i;
meta_needed = btf_type_vlen(t) * sizeof(*member);
if (meta_left < meta_needed) {
@@ -3369,7 +3368,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
{
const struct btf_member *member;
int err;
- u16 i;
+ u32 i;
/* Before continue resolving the next_member,
* ensure the last member is indeed resolved to a
@@ -3668,7 +3667,7 @@ end:
static int btf_repeat_fields(struct btf_field_info *info, int info_cnt,
u32 field_cnt, u32 repeat_cnt, u32 elem_size)
{
- u32 i, j;
+ u32 i, j, total_cnt, total_repeats;
u32 cur;
/* Ensure not repeating fields that should not be repeated. */
@@ -3686,10 +3685,9 @@ static int btf_repeat_fields(struct btf_field_info *info, int info_cnt,
}
}
- /* The type of struct size or variable size is u32,
- * so the multiplication will not overflow.
- */
- if (field_cnt * (repeat_cnt + 1) > info_cnt)
+ if (check_add_overflow(repeat_cnt, 1, &total_repeats) ||
+ check_mul_overflow(field_cnt, total_repeats, &total_cnt) ||
+ total_cnt > (u32)info_cnt)
return -E2BIG;
cur = field_cnt;
@@ -4447,7 +4445,7 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
const struct btf_enum *enums = btf_type_enum(t);
struct btf *btf = env->btf;
const char *fmt_str;
- u16 i, nr_enums;
+ u32 i, nr_enums;
u32 meta_needed;
nr_enums = btf_type_vlen(t);
@@ -4555,7 +4553,7 @@ static s32 btf_enum64_check_meta(struct btf_verifier_env *env,
const struct btf_enum64 *enums = btf_type_enum64(t);
struct btf *btf = env->btf;
const char *fmt_str;
- u16 i, nr_enums;
+ u32 i, nr_enums;
u32 meta_needed;
nr_enums = btf_type_vlen(t);
@@ -4683,7 +4681,7 @@ static void btf_func_proto_log(struct btf_verifier_env *env,
const struct btf_type *t)
{
const struct btf_param *args = (const struct btf_param *)(t + 1);
- u16 nr_args = btf_type_vlen(t), i;
+ u32 nr_args = btf_type_vlen(t), i;
btf_verifier_log(env, "return=%u args=(", t->type);
if (!nr_args) {
@@ -4929,7 +4927,7 @@ static int btf_datasec_resolve(struct btf_verifier_env *env,
{
const struct btf_var_secinfo *vsi;
struct btf *btf = env->btf;
- u16 i;
+ u32 i;
env->resolve_mode = RESOLVE_TBD;
for_each_vsi_from(i, v->next_member, v->t, vsi) {
@@ -5183,7 +5181,7 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
const struct btf_type *ret_type;
const struct btf_param *args;
const struct btf *btf;
- u16 nr_args, i;
+ u32 nr_args, i;
int err;
btf = env->btf;
@@ -5278,7 +5276,7 @@ static int btf_func_check(struct btf_verifier_env *env,
const struct btf_type *proto_type;
const struct btf_param *args;
const struct btf *btf;
- u16 nr_args, i;
+ u32 nr_args, i;
btf = env->btf;
proto_type = btf_type_by_id(btf, t->type);
@@ -5336,12 +5334,6 @@ static s32 btf_check_meta(struct btf_verifier_env *env,
}
meta_left -= sizeof(*t);
- if (t->info & ~BTF_INFO_MASK) {
- btf_verifier_log(env, "[%u] Invalid btf_info:%x",
- env->log_type_id, t->info);
- return -EINVAL;
- }
-
if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX ||
BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) {
btf_verifier_log(env, "[%u] Invalid kind:%u",
@@ -5914,25 +5906,10 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
return 0;
}
-static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size)
-{
- u32 log_true_size;
- int err;
-
- err = bpf_vlog_finalize(log, &log_true_size);
-
- if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) &&
- copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size),
- &log_true_size, sizeof(log_true_size)))
- err = -EFAULT;
-
- return err;
-}
-
-static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
+static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr,
+ struct bpf_log_attr *attr_log)
{
bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel);
- char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf);
struct btf_struct_metas *struct_meta_tab;
struct btf_verifier_env *env = NULL;
struct btf *btf = NULL;
@@ -5949,8 +5926,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
/* user could have requested verbose verifier output
* and supplied buffer to store the verification trace
*/
- err = bpf_vlog_init(&env->log, attr->btf_log_level,
- log_ubuf, attr->btf_log_size);
+ err = bpf_vlog_init(&env->log, attr_log->level, attr_log->ubuf, attr_log->size);
if (err)
goto errout_free;
@@ -6015,7 +5991,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
}
}
- err = finalize_log(&env->log, uattr, uattr_size);
+ err = bpf_log_attr_finalize(attr_log, &env->log);
if (err)
goto errout_free;
@@ -6027,7 +6003,7 @@ errout_meta:
btf_free_struct_meta_tab(btf);
errout:
/* overwrite err with -ENOSPC or -EFAULT */
- ret = finalize_log(&env->log, uattr, uattr_size);
+ ret = bpf_log_attr_finalize(attr_log, &env->log);
if (ret)
err = ret;
errout_free:
@@ -6980,7 +6956,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
info->reg_type = ctx_arg_info->reg_type;
info->btf = ctx_arg_info->btf ? : btf_vmlinux;
info->btf_id = ctx_arg_info->btf_id;
- info->ref_obj_id = ctx_arg_info->ref_obj_id;
+ info->ref_id = ctx_arg_info->ref_id;
return true;
}
}
@@ -7825,6 +7801,134 @@ enum btf_arg_tag {
ARG_TAG_ARENA = BIT_ULL(5),
};
+/*
+ * Gather the 'arg:<tag>' decl_tags attached to argument @arg_idx of @fn_t and
+ * OR the corresponding ARG_TAG_* bits into *@tags.
+ *
+ * Returns 0 once the decl_tag search ends with -ENOENT, -EOPNOTSUPP when a tag
+ * is found on a non-global function or its value is not in the table below,
+ * and otherwise whatever non-positive value ended the search.
+ */
+static int btf_scan_decl_tags(struct bpf_verifier_env *env,
+			      const struct btf *btf,
+			      const struct btf_type *fn_t,
+			      u32 arg_idx, bool is_global, u32 *tags)
+{
+	static const struct {
+		const char *tag_value;
+		enum btf_arg_tag arg_tag;
+	} tag_values[] = {
+		{ "ctx", ARG_TAG_CTX },
+		{ "trusted", ARG_TAG_TRUSTED },
+		{ "untrusted", ARG_TAG_UNTRUSTED },
+		{ "nonnull", ARG_TAG_NONNULL },
+		{ "nullable", ARG_TAG_NULLABLE },
+		{ "arena", ARG_TAG_ARENA },
+	};
+	const char tag_key[] = "arg:";
+	int id = btf_named_start_id(btf, false) - 1;
+
+	/*
+	 * The 'arg:<tag>' decl_tag takes precedence over the derivation
+	 * of the register type from the BTF type itself.
+	 */
+	for (;;) {
+		const struct btf_type *tag_t;
+		const char *tag;
+		int i;
+
+		id = btf_find_next_decl_tag(btf, fn_t, arg_idx, tag_key, id);
+		if (id <= 0)
+			break;
+
+		/* disallow arg tags in static subprogs */
+		if (!is_global) {
+			bpf_log(&env->log,
+				"arg#%d type tag is not supported in static functions\n",
+				arg_idx);
+			return -EOPNOTSUPP;
+		}
+
+		/* Step over the "arg:" prefix to reach the tag value. */
+		tag_t = btf_type_by_id(btf, id);
+		tag = __btf_name_by_offset(btf, tag_t->name_off) + (sizeof(tag_key) - 1);
+
+		for (i = 0; i < ARRAY_SIZE(tag_values); ++i)
+			if (!strcmp(tag, tag_values[i].tag_value))
+				break;
+
+		/* Ran off the end of the table: the value is not one we know. */
+		if (i == ARRAY_SIZE(tag_values)) {
+			bpf_log(&env->log, "arg#%d has unsupported set of tags\n", arg_idx);
+			return -EOPNOTSUPP;
+		}
+		*tags |= tag_values[i].arg_tag;
+	}
+
+	if (id != -ENOENT) {
+		bpf_log(&env->log, "arg#%d type tag fetching failure: %d\n", arg_idx, id);
+		return id;
+	}
+
+	return 0;
+}
+
+/*
+ * Collect the type tags attached to the pointee of the pointer type that
+ * @type_id resolves to, ORing the matching ARG_TAG_* bits into *@tags.
+ *
+ * Returns 0 on success, including when @type_id does not resolve to a
+ * pointer (nothing is scanned then), and -EOPNOTSUPP for any type tag other
+ * than "arena".
+ */
+static int btf_scan_type_tags(struct bpf_verifier_env *env,
+			      const struct btf *btf, u32 type_id,
+			      u32 *tags)
+{
+	/* Find the first pointer type in the chain. */
+	const struct btf_type *t = btf_type_skip_modifiers(btf, type_id, NULL);
+
+	/*
+	 * Type tags on non-pointer types, which neither LLVM nor GCC support
+	 * anyway, are not looked at: report success without collecting any.
+	 */
+	if (!t || !btf_type_is_ptr(t))
+		return 0;
+
+	/* We got a pointer, walk the modifier chain hanging off its pointee. */
+	t = btf_type_by_id(btf, t->type);
+	while (t && btf_type_is_modifier(t)) {
+		/* Only type tags matter; other modifiers are stepped over. */
+		if (btf_type_is_type_tag(t)) {
+			const char *tag = __btf_name_by_offset(btf, t->name_off);
+
+			if (strcmp(tag, "arena")) {
+				bpf_log(&env->log, "function signature member has unsupported type tag '%s'\n",
+					tag);
+				return -EOPNOTSUPP;
+			}
+			*tags |= ARG_TAG_ARENA;
+		}
+		t = btf_type_by_id(btf, t->type);
+	}
+
+	return 0;
+}
+
+/*
+ * Check whether the return type of function prototype @t is acceptable for
+ * subprog number @subprog.
+ *
+ * Returns 0 when it is, the error from btf_scan_type_tags() if scanning the
+ * return type's tags fails, and -EOPNOTSUPP for any other return type.
+ */
+static int btf_validate_return_type(struct bpf_verifier_env *env, struct btf *btf,
+				    const struct btf_type *t, int subprog)
+{
+	const struct btf_type *ret_t;
+	u32 tags = 0;
+	int err = btf_scan_type_tags(env, btf, t->type, &tags);
+
+	if (err)
+		return err;
+
+	ret_t = btf_type_skip_modifiers(btf, t->type, NULL);
+
+	/*
+	 * We allow all subprogs except for the main one to return any kind of arena pointer.
+	 * General arena variables are not allowed, since it makes no sense to return by value
+	 * a variable that's on the heap in the first place.
+	 */
+	if (subprog && (tags & ARG_TAG_ARENA) && btf_type_is_ptr(ret_t))
+		return 0;
+
+	/* Anything that is neither void nor a scalar is refused. */
+	if (!btf_type_is_void(ret_t) && !btf_type_is_int(ret_t) && !btf_is_any_enum(ret_t))
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
/* Process BTF of a function to produce high-level expectation of function
* arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information
* is cached in subprog info for reuse.
@@ -7843,6 +7947,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
struct btf *btf = prog->aux->btf;
const struct btf_param *args;
const struct btf_type *t, *ref_t, *fn_t;
+ int err;
u32 i, nargs, btf_id;
const char *tname;
@@ -7887,25 +7992,36 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
}
args = (const struct btf_param *)(t + 1);
nargs = btf_type_vlen(t);
+ sub->arg_cnt = nargs;
+ if (nargs > MAX_BPF_FUNC_ARGS) {
+ bpf_log(log, "kernel supports at most %d parameters, function %s has %d\n",
+ MAX_BPF_FUNC_ARGS, tname, nargs);
+ return -EFAULT;
+ }
if (nargs > MAX_BPF_FUNC_REG_ARGS) {
- if (!is_global)
- return -EINVAL;
- bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n",
+ if (!bpf_jit_supports_stack_args()) {
+ bpf_log(log, "JIT does not support function %s() with %d args\n",
+ tname, nargs);
+ return -EFAULT;
+ }
+ sub->stack_arg_cnt = nargs - MAX_BPF_FUNC_REG_ARGS;
+ }
+
+ if (is_global && nargs > MAX_BPF_FUNC_REG_ARGS) {
+ bpf_log(log, "global function %s has %d > %d args, stack args not supported\n",
tname, nargs, MAX_BPF_FUNC_REG_ARGS);
return -EINVAL;
}
- /* check that function is void or returns int, exception cb also requires this */
- t = btf_type_by_id(btf, t->type);
- while (btf_type_is_modifier(t))
- t = btf_type_by_id(btf, t->type);
- if (!btf_type_is_void(t) && !btf_type_is_int(t) && !btf_is_any_enum(t)) {
- if (!is_global)
- return -EINVAL;
- bpf_log(log,
- "Global function %s() return value not void or scalar. "
- "Only those are supported.\n",
- tname);
- return -EINVAL;
+
+ err = btf_validate_return_type(env, btf, t, subprog);
+ if (err) {
+ if (is_global) {
+ bpf_log(log,
+ "Global function %s() return value not void or scalar. "
+ "Only those are supported.\n",
+ tname);
+ }
+ return err;
}
/* Convert BTF function arguments into verifier types.
@@ -7913,42 +8029,13 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
*/
for (i = 0; i < nargs; i++) {
u32 tags = 0;
- int id = btf_named_start_id(btf, false) - 1;
-
- /* 'arg:<tag>' decl_tag takes precedence over derivation of
- * register type from BTF type itself
- */
- while ((id = btf_find_next_decl_tag(btf, fn_t, i, "arg:", id)) > 0) {
- const struct btf_type *tag_t = btf_type_by_id(btf, id);
- const char *tag = __btf_name_by_offset(btf, tag_t->name_off) + 4;
-
- /* disallow arg tags in static subprogs */
- if (!is_global) {
- bpf_log(log, "arg#%d type tag is not supported in static functions\n", i);
- return -EOPNOTSUPP;
- }
+ err = btf_scan_decl_tags(env, btf, fn_t, i, is_global, &tags);
+ if (err)
+ return err;
- if (strcmp(tag, "ctx") == 0) {
- tags |= ARG_TAG_CTX;
- } else if (strcmp(tag, "trusted") == 0) {
- tags |= ARG_TAG_TRUSTED;
- } else if (strcmp(tag, "untrusted") == 0) {
- tags |= ARG_TAG_UNTRUSTED;
- } else if (strcmp(tag, "nonnull") == 0) {
- tags |= ARG_TAG_NONNULL;
- } else if (strcmp(tag, "nullable") == 0) {
- tags |= ARG_TAG_NULLABLE;
- } else if (strcmp(tag, "arena") == 0) {
- tags |= ARG_TAG_ARENA;
- } else {
- bpf_log(log, "arg#%d has unsupported set of tags\n", i);
- return -EOPNOTSUPP;
- }
- }
- if (id != -ENOENT) {
- bpf_log(log, "arg#%d type tag fetching failure: %d\n", i, id);
- return id;
- }
+ err = btf_scan_type_tags(env, btf, args[i].type, &tags);
+ if (err)
+ return err;
t = btf_type_by_id(btf, args[i].type);
while (btf_type_is_modifier(t))
@@ -7973,7 +8060,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
bpf_log(log, "arg#%d has invalid combination of tags\n", i);
return -EINVAL;
}
- sub->args[i].arg_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY;
+ sub->args[i].arg_type = ARG_PTR_TO_DYNPTR;
continue;
}
if (tags & ARG_TAG_TRUSTED) {
@@ -8074,7 +8161,6 @@ skip_pointer:
return -EINVAL;
}
- sub->arg_cnt = nargs;
sub->args_cached = true;
return 0;
@@ -8196,12 +8282,12 @@ static int __btf_new_fd(struct btf *btf)
return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
}
-int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log)
{
struct btf *btf;
int ret;
- btf = btf_parse(attr, uattr, uattr_size);
+ btf = btf_parse(attr, uattr, attr_log);
if (IS_ERR(btf))
return PTR_ERR(btf);
@@ -8684,6 +8770,39 @@ static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name,
return 0;
}
+/*
+ * Refuse a module kfunc whose name is already in use elsewhere.
+ *
+ * Only module BTF is checked. @func_name is looked up with BTF kind @kind in
+ * vmlinux BTF and, when CONFIG_DEBUG_INFO_BTF_MODULES is enabled, in the BTF of
+ * every other entry on btf_modules (walked with btf_module_mutex held).
+ *
+ * Returns 0 if @btf is not module BTF or no clash was found, -EINVAL after
+ * logging the clashing id otherwise.
+ */
+static int btf_check_kfunc_name(struct btf *btf, const char *func_name, u32 kind)
+{
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+	struct btf_module *btf_mod;
+#endif
+	s32 id;
+
+	if (!btf_is_module(btf))
+		return 0;
+
+	id = btf_find_by_name_kind(bpf_get_btf_vmlinux(), func_name, kind);
+	if (id >= 0) {
+		pr_err("kfunc %s (id: %d) is already present in vmlinux.\n",
+		       func_name, id);
+		return -EINVAL;
+	}
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+	guard(mutex)(&btf_module_mutex);
+	/* Nothing is unlinked during the walk, so the plain iterator is enough. */
+	list_for_each_entry(btf_mod, &btf_modules, list) {
+		/* A module cannot clash with its own BTF. */
+		if (btf_mod->btf == btf)
+			continue;
+		id = btf_find_by_name_kind(btf_mod->btf, func_name, kind);
+		if (id >= 0) {
+			pr_err("kfunc %s (id: %d) is already present in module %s.\n",
+			       func_name, id, btf_mod->module->name);
+			return -EINVAL;
+		}
+	}
+#endif
+	return 0;
+}
+
static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags)
{
const struct btf_type *func;
@@ -8697,7 +8816,8 @@ static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags)
/* sanity check kfunc name */
func_name = btf_name_by_offset(btf, func->name_off);
- if (!func_name || !func_name[0])
+ if (!func_name || !func_name[0] ||
+ btf_check_kfunc_name(btf, func_name, BTF_INFO_KIND(func->info)))
return -EINVAL;
func = btf_type_by_id(btf, func->type);
diff --git a/kernel/bpf/cfg.c b/kernel/bpf/cfg.c
index 998f42a8189a..26d37066465f 100644
--- a/kernel/bpf/cfg.c
+++ b/kernel/bpf/cfg.c
@@ -64,11 +64,19 @@ static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off)
subprog->might_sleep = true;
}
+/* Flag the subprog containing instruction @off as one that might throw. */
+static void mark_subprog_might_throw(struct bpf_verifier_env *env, int off)
+{
+	bpf_find_containing_subprog(env, off)->might_throw = true;
+}
+
/* 't' is an index of a call-site.
* 'w' is a callee entry point.
* Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED.
* Rely on DFS traversal order and absence of recursive calls to guarantee that
- * callee's change_pkt_data marks would be correct at that moment.
+ * callee's effect marks would be correct at that moment.
*/
static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w)
{
@@ -78,6 +86,7 @@ static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w)
callee = bpf_find_containing_subprog(env, w);
caller->changes_pkt_data |= callee->changes_pkt_data;
caller->might_sleep |= callee->might_sleep;
+ caller->might_throw |= callee->might_throw;
}
enum {
@@ -509,6 +518,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
mark_subprog_might_sleep(env, t);
if (ret == 0 && bpf_is_kfunc_pkt_changing(&meta))
mark_subprog_changes_pkt_data(env, t);
+ if (ret == 0 && bpf_is_throw_kfunc(insn))
+ mark_subprog_might_throw(env, t);
}
return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 876f6a81a9b6..83ce66296ac1 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -55,6 +55,28 @@ void __init cgroup_bpf_lifetime_notifier_init(void)
&cgroup_bpf_lifetime_nb));
}
+#ifdef CONFIG_BPF_LSM
+/* Per-slot bookkeeping for the CGROUP_LSM_* attach types. */
+struct cgroup_lsm_atype {
+	u32 attach_btf_id;
+	int refcnt;
+	bool returns_errno;
+};
+
+static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM];
+
+/*
+ * Attach types outside the CGROUP_LSM_START..CGROUP_LSM_END range always
+ * report true; LSM slots report the returns_errno flag stored for the slot.
+ */
+static bool cgroup_bpf_hook_returns_errno(enum cgroup_bpf_attach_type atype)
+{
+	if (atype < CGROUP_LSM_START || atype > CGROUP_LSM_END)
+		return true;
+
+	return READ_ONCE(cgroup_lsm_atype[atype - CGROUP_LSM_START].returns_errno);
+}
+#else
+/* Without CONFIG_BPF_LSM there are no LSM slots to consult. */
+static bool cgroup_bpf_hook_returns_errno(enum cgroup_bpf_attach_type atype)
+{
+	return true;
+}
+#endif
+
/* __always_inline is necessary to prevent indirect call through run_prog
* function pointer.
*/
@@ -83,7 +105,8 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
*(ret_flags) |= (func_ret >> 1);
func_ret &= 1;
}
- if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
+ if (!func_ret && cgroup_bpf_hook_returns_errno(atype) &&
+ !IS_ERR_VALUE((long)run_ctx.retval))
run_ctx.retval = -EPERM;
item++;
}
@@ -156,13 +179,6 @@ unsigned int __cgroup_bpf_run_lsm_current(const void *ctx,
}
#ifdef CONFIG_BPF_LSM
-struct cgroup_lsm_atype {
- u32 attach_btf_id;
- int refcnt;
-};
-
-static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM];
-
static enum cgroup_bpf_attach_type
bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
{
@@ -191,10 +207,13 @@ void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype)
lockdep_assert_held(&cgroup_mutex);
- WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id &&
- cgroup_lsm_atype[i].attach_btf_id != attach_btf_id);
-
- cgroup_lsm_atype[i].attach_btf_id = attach_btf_id;
+ if (!cgroup_lsm_atype[i].attach_btf_id) {
+ cgroup_lsm_atype[i].attach_btf_id = attach_btf_id;
+ WRITE_ONCE(cgroup_lsm_atype[i].returns_errno,
+ bpf_lsm_hook_returns_errno(attach_btf_id));
+ } else {
+ WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id != attach_btf_id);
+ }
cgroup_lsm_atype[i].refcnt++;
}
@@ -203,8 +222,10 @@ void bpf_cgroup_atype_put(int cgroup_atype)
int i = cgroup_atype - CGROUP_LSM_START;
cgroup_lock();
- if (--cgroup_lsm_atype[i].refcnt <= 0)
+ if (--cgroup_lsm_atype[i].refcnt <= 0) {
+ WRITE_ONCE(cgroup_lsm_atype[i].returns_errno, true);
cgroup_lsm_atype[i].attach_btf_id = 0;
+ }
WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0);
cgroup_unlock();
}
@@ -1208,7 +1229,7 @@ static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
/* Must be called with cgroup_mutex held to avoid races. */
static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
- union bpf_attr __user *uattr)
+ union bpf_attr __user *uattr, u32 uattr_size)
{
__u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
@@ -1259,7 +1280,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
return -EFAULT;
if (!effective_query && from_atype == to_atype)
revision = cgrp->bpf.revisions[from_atype];
- if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision)))
+ if (uattr_size >= offsetofend(union bpf_attr, query.revision) &&
+ copy_to_user(&uattr->query.revision, &revision, sizeof(revision)))
return -EFAULT;
if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt)
/* return early if user requested only program count + flags */
@@ -1312,12 +1334,12 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
}
static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
- union bpf_attr __user *uattr)
+ union bpf_attr __user *uattr, u32 uattr_size)
{
int ret;
cgroup_lock();
- ret = __cgroup_bpf_query(cgrp, attr, uattr);
+ ret = __cgroup_bpf_query(cgrp, attr, uattr, uattr_size);
cgroup_unlock();
return ret;
}
@@ -1520,7 +1542,7 @@ out_put_cgroup:
}
int cgroup_bpf_prog_query(const union bpf_attr *attr,
- union bpf_attr __user *uattr)
+ union bpf_attr __user *uattr, u32 uattr_size)
{
struct cgroup *cgrp;
int ret;
@@ -1529,7 +1551,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
if (IS_ERR(cgrp))
return PTR_ERR(cgrp);
- ret = cgroup_bpf_query(cgrp, attr, uattr);
+ ret = cgroup_bpf_query(cgrp, attr, uattr, uattr_size);
cgroup_put(cgrp);
return ret;
@@ -1935,8 +1957,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
kfree(ctx.cur_val);
- if (ret == 1 && ctx.new_updated) {
- kfree(*buf);
+ if (!ret && ctx.new_updated) {
+ kvfree(*buf);
*buf = ctx.new_val;
*pcount = ctx.new_len;
} else {
@@ -2342,6 +2364,7 @@ BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
return -E2BIG;
memcpy(ctx->new_val, buf, buf_len);
+ ((char *)ctx->new_val)[buf_len] = '\0';
ctx->new_len = buf_len;
ctx->new_updated = 1;
diff --git a/kernel/bpf/cnum.c b/kernel/bpf/cnum.c
new file mode 100644
index 000000000000..86142cb2aee5
--- /dev/null
+++ b/kernel/bpf/cnum.c
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bits.h>
+
+#define T 32
+#include "cnum_defs.h"
+#undef T
+
+#define T 64
+#include "cnum_defs.h"
+#undef T
+
+/*
+ * Narrow a 64-bit cnum to 32 bits. An empty input stays empty; a size of
+ * U32_MAX or more yields base 0 / size U32_MAX; otherwise the base is
+ * truncated to its low 32 bits and the size is kept.
+ */
+struct cnum32 cnum32_from_cnum64(struct cnum64 cnum)
+{
+	struct cnum32 res = { .base = 0, .size = U32_MAX };
+
+	if (cnum64_is_empty(cnum))
+		return CNUM32_EMPTY;
+
+	if (cnum.size < U32_MAX) {
+		res.base = (u32)cnum.base;
+		res.size = cnum.size;
+	}
+	return res;
+}
+
+/*
+ * Suppose 'a' and 'b' are laid out as follows:
+ *
+ * 64-bit number axis --->
+ *
+ * N*2^32 (N+1)*2^32 (N+2)*2^32 (N+3)*2^32
+ * ||------|---|=====|-------||----------|=====|-------||----------|=====|----|--||
+ * | |< b >| |< b >| |< b >| |
+ * | | | |
+ * |<--+--------------------------- a ---------------------------+--->|
+ * | |
+ * |<-------------------------- t -------------------------->|
+ *
+ * In such a case it is possible to infer a more tight representation t
+ * such that ∀ v ∈ a, (u32)v ∈ b: v ∈ t.
+ */
+struct cnum64 cnum64_cnum32_intersect(struct cnum64 a, struct cnum32 b)
+{
+ /*
+ * To simplify reasoning, rotate the circles so that [virtual] a1 starts
+ * at u32 boundary, b1 represents b in this new frame of reference.
+ */
+ struct cnum32 b1 = { b.base - (u32)a.base, b.size };
+ struct cnum64 t = a;
+ u64 d, b1_max;
+
+ /* An empty operand leaves nothing to intersect. */
+ if (cnum64_is_empty(a) || cnum32_is_empty(b))
+ return CNUM64_EMPTY;
+
+ /* b1 crosses the u32 wrap-around point in the rotated frame. */
+ if (cnum32_urange_overflow(b1)) {
+ b1_max = (u32)b1.base + (u32)b1.size; /* overflow here is fine and necessary */
+ if ((u32)a.size > b1_max && (u32)a.size < b1.base) {
+ /*
+ * N*2^32 (N+1)*2^32
+ * ||=====|------------|=====||=====|---------|---|=====||
+ * |b1 ->| |<- b1||b1 ->| | |<- b1|
+ * |<----------------- a1 ------------------>|
+ * |<-------------- t ------------>|<-- d -->| (after adjustment)
+ * ^
+ * b1_max
+ */
+ d = (u32)a.size - b1_max;
+ t.size -= d;
+ } else {
+ /*
+ * No adjustments possible in the following cases:
+ *
+ * ||=====|------------|=====||===|=|-------------|=|===||
+ * |b1 ->| |<- b1||b1 +>| |<+ b1|
+ * |<----------------- a1 ------>| |
+ * |<----------------- (or) a1 ------------------->|
+ */
+ }
+ } else {
+ if (t.size < b1.base)
+ /*
+ * N*2^32 (N+1)*2^32
+ * ||----------|--|=======|--||------>
+ * |<-- a1 -->| |<- b ->|
+ */
+ return CNUM64_EMPTY;
+ /*
+ * N*2^32 (N+1)*2^32
+ * ||-------------|========|-||-----| -------|========|-||
+ * | |<- b1 ->| | |<- b1 ->|
+ * |<------------+ a1 ------------>|
+ * |<------ t ------>| (after adjustment)
+ */
+ t.base += b1.base;
+ t.size -= b1.base;
+ b1_max = b1.base + b1.size;
+ /* d is how much gets trimmed off the end of t; 0 means no trimming. */
+ d = 0;
+ if ((u32)a.size < b1.base)
+ /*
+ * N*2^32 (N+1)*2^32
+ * ||-------------|========|-||------|-------|========|-||
+ * | |<- b1 ->| | |<- b1 ->|
+ * |<------------+-- a1 --+-------->|
+ * |<- t ->|<-- d -->| (after adjustment)
+ */
+ d = (u32)a.size + (BIT_ULL(32) - b1_max);
+ else if ((u32)a.size >= b1_max)
+ /*
+ * N*2^32 (N+1)*2^32
+ * ||--|========|------------||--|========|-------|-----||
+ * | |<- b1 ->| |<- b1 ->| |
+ * |<-+------------------ a1 ------------+------>|
+ * |<-------------- t --------------->|<- d ->| (after adjustment)
+ */
+ d = (u32)a.size - b1_max;
+ /* Trimming more than what is left of t means the result is empty. */
+ if (t.size < d)
+ return CNUM64_EMPTY;
+ t.size -= d;
+ }
+ return t;
+}
diff --git a/kernel/bpf/cnum_defs.h b/kernel/bpf/cnum_defs.h
new file mode 100644
index 000000000000..a90e317e3578
--- /dev/null
+++ b/kernel/bpf/cnum_defs.h
@@ -0,0 +1,247 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#ifndef T
+#error "Define T (bit width: 32, 64) before including cnum_defs.h"
+#endif
+
+#include <linux/cnum.h>
+#include <linux/kernel.h>
+#include <linux/limits.h>
+#include <linux/minmax.h>
+#include <linux/compiler_types.h>
+
+#define cnum_t __PASTE(cnum, T)
+#define ut __PASTE(u, T)
+#define st __PASTE(s, T)
+#define UT_MAX __PASTE(__PASTE(U, T), _MAX)
+#define ST_MAX __PASTE(__PASTE(S, T), _MAX)
+#define ST_MIN __PASTE(__PASTE(S, T), _MIN)
+#define EMPTY __PASTE(__PASTE(CNUM, T), _EMPTY)
+#define FN(name) __PASTE(__PASTE(cnum, T), __PASTE(_, name))
+
+/* Build the cnum that starts at @min and spans max - min further values. */
+struct cnum_t FN(from_urange)(ut min, ut max)
+{
+	struct cnum_t res = { .base = min, .size = (ut)max - min };
+
+	return res;
+}
+
+/*
+ * Build a cnum from the signed bounds @min and @max. A range whose size
+ * comes out as UT_MAX is stored with base 0.
+ */
+struct cnum_t FN(from_srange)(st min, st max)
+{
+	struct cnum_t res = { .base = (ut)min, .size = (ut)max - (ut)min };
+
+	if (res.size == UT_MAX)
+		res.base = 0;
+	return res;
+}
+
+/* True if this cnum represents two unsigned ranges. */
+static inline bool FN(urange_overflow)(struct cnum_t cnum)
+{
+	/* Distance from base up to UT_MAX; a larger size runs past it. */
+	const ut room = UT_MAX - (ut)cnum.base;
+
+	/* Same as cnum.base + cnum.size > UT_MAX but avoids overflow */
+	return cnum.size > room;
+}
+
+/*
+ * cnum{T}_umin / cnum{T}_umax report the unsigned bounds of this cnum.
+ * When the cnum crosses the UT_MAX/0 boundary the bounds degrade to the
+ * unbound range [0..UT_MAX].
+ */
+ut FN(umin)(struct cnum_t cnum)
+{
+	if (FN(urange_overflow)(cnum))
+		return 0;
+	return cnum.base;
+}
+EXPORT_SYMBOL_GPL(FN(umin));
+
+ut FN(umax)(struct cnum_t cnum)
+{
+	if (FN(urange_overflow)(cnum))
+		return UT_MAX;
+	return cnum.base + cnum.size;
+}
+EXPORT_SYMBOL_GPL(FN(umax));
+
+/*
+ * True if this cnum represents two signed ranges, i.e. it contains both
+ * ST_MAX and ST_MIN.
+ */
+static inline bool FN(srange_overflow)(struct cnum_t cnum)
+{
+	if (!FN(contains)(cnum, (ut)ST_MAX))
+		return false;
+	return FN(contains)(cnum, (ut)ST_MIN);
+}
+
+/*
+ * cnum{T}_smin / cnum{T}_smax report the signed bounds of this cnum.
+ * When the cnum crosses the ST_MAX/ST_MIN boundary the bounds degrade to the
+ * unbound range [ST_MIN..ST_MAX].
+ */
+st FN(smin)(struct cnum_t cnum)
+{
+	st first = (st)cnum.base;
+	st last = (st)(cnum.base + cnum.size);
+
+	if (FN(srange_overflow)(cnum))
+		return ST_MIN;
+	return min(first, last);
+}
+
+st FN(smax)(struct cnum_t cnum)
+{
+	st first = (st)cnum.base;
+	st last = (st)(cnum.base + cnum.size);
+
+	if (FN(srange_overflow)(cnum))
+		return ST_MAX;
+	return max(first, last);
+}
+
+/*
+ * Returns a possibly empty intersection of cnums 'a' and 'b'.
+ * If 'a' and 'b' intersect in two sub-arcs, the function over-approximates
+ * and returns either 'a' or 'b', whichever is smaller.
+ */
+struct cnum_t FN(intersect)(struct cnum_t a, struct cnum_t b)
+{
+ struct cnum_t b1;
+ ut dbase;
+
+ if (FN(is_empty)(a) || FN(is_empty)(b))
+ return EMPTY;
+
+ /* Order the operands so that a.base <= b.base before rotating. */
+ if (a.base > b.base)
+ swap(a, b);
+
+ /*
+ * Rotate frame of reference such that a.base is 0.
+ * 'b1' is 'b' in this frame of reference.
+ */
+ dbase = b.base - a.base;
+ b1 = (struct cnum_t){ dbase, b.size };
+ if (FN(urange_overflow)(b1)) {
+ if (b1.base <= a.size) {
+ /*
+ * Rotated frame (a.base at origin):
+ *
+ * 0 UT_MAX
+ * |--------------------------------------------|
+ * [=== a ==========================] |
+ * [= b1 tail =] [========= b1 main ==========>]
+ * ^-- b1.base <= a.size
+ *
+ * 'a' and 'b' intersect in two disjoint arcs,
+ * can't represent as single cnum, over-approximate
+ * the result.
+ */
+ return a.size <= b.size ? a : b;
+ } else {
+ /*
+ * Rotated frame (a.base at origin):
+ *
+ * 0 UT_MAX
+ * |--------------------------------------------|
+ * [=== a =============] | |
+ * [= b1 tail =] [======= b1 main ====>]
+ * ^-- b1.base > a.size
+ *
+ * Only 'b' tail intersects 'a'.
+ */
+ return (struct cnum_t) {
+ .base = a.base,
+ .size = min(a.size, (ut)(b1.base + b1.size)),
+ };
+ }
+ } else if (a.size >= b1.base) {
+ /*
+ * Rotated frame (a.base at origin):
+ *
+ * 0 UT_MAX
+ * |--------------------------------------------------|
+ * [=== a ==================================] |
+ * [== b1 =====================]
+ *
+ * 0 UT_MAX
+ * |--------------------------------------------------|
+ * [=== a ==================================] |
+ * [== b1 ====]
+ * ^-- b1.base <= a.size
+ * |<-- a.size - dbase -->|
+ *
+ * 'a' and 'b' intersect as one cnum.
+ */
+ return (struct cnum_t) {
+ .base = b.base,
+ .size = min((ut)(a.size - dbase), b.size),
+ };
+ } else {
+ /* b1 does not wrap and starts past the end of 'a': nothing in common. */
+ return EMPTY;
+ }
+}
+
+/* In-place variant of intersect: *dst becomes the intersection of *dst and src. */
+void FN(intersect_with)(struct cnum_t *dst, struct cnum_t src)
+{
+	struct cnum_t narrowed = FN(intersect)(*dst, src);
+
+	*dst = narrowed;
+}
+
+/* Intersect *dst in place with the cnum built from the unsigned bounds. */
+void FN(intersect_with_urange)(struct cnum_t *dst, ut min, ut max)
+{
+	struct cnum_t range = FN(from_urange)(min, max);
+
+	FN(intersect_with)(dst, range);
+}
+
+/* Intersect *dst in place with the cnum built from the signed bounds. */
+void FN(intersect_with_srange)(struct cnum_t *dst, st min, st max)
+{
+	struct cnum_t range = FN(from_srange)(min, max);
+
+	FN(intersect_with)(dst, range);
+}
+
+/*
+ * Reset the base to 0 for a cnum whose size is UT_MAX, unless the base is
+ * already 0 or equals (ut)ST_MAX. Every other cnum is returned unchanged.
+ */
+static inline struct cnum_t FN(normalize)(struct cnum_t cnum)
+{
+	if (cnum.size != UT_MAX)
+		return cnum;
+	if (cnum.base == 0 || cnum.base == (ut)ST_MAX)
+		return cnum;
+
+	cnum.base = 0;
+	return cnum;
+}
+
+/*
+ * Sum of two cnums. Empty if either operand is empty; base 0 / size UT_MAX
+ * when the two sizes together exceed UT_MAX; otherwise the normalized cnum
+ * with the bases and the sizes added.
+ */
+struct cnum_t FN(add)(struct cnum_t a, struct cnum_t b)
+{
+	struct cnum_t sum;
+
+	if (FN(is_empty)(a) || FN(is_empty)(b))
+		return EMPTY;
+
+	/* Written as a subtraction so the size check itself cannot wrap. */
+	if (a.size > UT_MAX - b.size)
+		return (struct cnum_t){ 0, (ut)UT_MAX };
+
+	sum = (struct cnum_t){ a.base + b.base, a.size + b.size };
+	return FN(normalize)(sum);
+}
+
+/*
+ * Negation of a cnum: the result keeps the size and starts at the negated
+ * value of base + size. An empty cnum stays empty.
+ */
+struct cnum_t FN(negate)(struct cnum_t a)
+{
+	ut last;
+
+	if (FN(is_empty)(a))
+		return EMPTY;
+
+	last = (ut)a.base + a.size;
+	return FN(normalize)((struct cnum_t){ -last, a.size });
+}
+
+/* True if both base and size match those of the EMPTY cnum. */
+bool FN(is_empty)(struct cnum_t cnum)
+{
+	if (cnum.base != EMPTY.base)
+		return false;
+	return cnum.size == EMPTY.size;
+}
+
+/*
+ * True if @v is one of the values of @cnum. A cnum that wraps past UT_MAX
+ * holds @v when it lies at or above the base or at or below the wrapped end;
+ * otherwise @v has to lie between base and base + size.
+ */
+bool FN(contains)(struct cnum_t cnum, ut v)
+{
+	ut last;
+
+	if (FN(is_empty)(cnum))
+		return false;
+
+	last = (ut)cnum.base + cnum.size;
+	if (FN(urange_overflow)(cnum))
+		return v >= cnum.base || v <= last;
+
+	return v >= cnum.base && v <= last;
+}
+
+/* True if the cnum has size 0. */
+bool FN(is_const)(struct cnum_t cnum)
+{
+	return !cnum.size;
+}
+
+/*
+ * True if every value of @smaller is also a value of @bigger.
+ *
+ * An empty @smaller is a subset of anything; a non-empty @smaller is never a
+ * subset of an empty @bigger.
+ */
+bool FN(is_subset)(struct cnum_t bigger, struct cnum_t smaller)
+{
+	/* Name the helper via FN(name) first, then call it, as everywhere else. */
+	if (FN(is_empty)(smaller))
+		return true;
+	if (FN(is_empty)(bigger))
+		return false;
+	/* rotate both arcs such that 'bigger' starts at origin, hence does not overflow */
+	smaller.base -= bigger.base;
+	bigger.base = 0;
+	/* A wrapping 'smaller' only fits when 'bigger' covers the whole circle. */
+	if (FN(urange_overflow)(smaller) && bigger.size < UT_MAX)
+		return false;
+	return smaller.base + smaller.size <= bigger.size;
+}
+
+#undef EMPTY
+#undef cnum_t
+#undef ut
+#undef st
+#undef UT_MAX
+#undef ST_MAX
+#undef ST_MIN
+#undef FN
diff --git a/kernel/bpf/const_fold.c b/kernel/bpf/const_fold.c
index db73c4740b1e..b2a19acadb91 100644
--- a/kernel/bpf/const_fold.c
+++ b/kernel/bpf/const_fold.c
@@ -58,6 +58,14 @@ static void const_reg_xfer(struct bpf_verifier_env *env, struct const_arg_info *
u8 opcode = BPF_OP(insn->code) | BPF_SRC(insn->code);
int r;
+ /* Stack arg stores (r11-based) are outside the tracked register set. */
+ if (is_stack_arg_st(insn) || is_stack_arg_stx(insn))
+ return;
+ if (is_stack_arg_ldx(insn)) {
+ ci_out[insn->dst_reg] = unknown;
+ return;
+ }
+
switch (class) {
case BPF_ALU:
case BPF_ALU64:
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8b018ff48875..649cce41e13f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1299,8 +1299,8 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
u32 imm_rnd = get_random_u32();
s16 off;
- BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG);
- BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG);
+ BUILD_BUG_ON(BPF_REG_PARAMS + 2 != MAX_BPF_JIT_REG);
+ BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG);
/* Constraints on AX register:
*
@@ -1582,6 +1582,16 @@ bool bpf_insn_is_indirect_target(const struct bpf_verifier_env *env, const struc
insn_idx += prog->aux->subprog_start;
return env->insn_aux_data[insn_idx].indirect_target;
}
+
+/*
+ * Stack-argument count of @prog's subprog minus the incoming count reported by
+ * bpf_in_stack_arg_cnt(). Returns 0 when no verifier environment is supplied.
+ */
+u16 bpf_out_stack_arg_cnt(const struct bpf_verifier_env *env, const struct bpf_prog *prog)
+{
+	const struct bpf_subprog_info *info;
+
+	if (env == NULL)
+		return 0;
+
+	/* The subprog record is selected by the program's func_idx. */
+	info = &env->subprog_info[prog->aux->func_idx];
+
+	return info->stack_arg_cnt - bpf_in_stack_arg_cnt(info);
+}
#endif /* CONFIG_BPF_JIT */
/* Base function for offset calculation. Needs to go into .text section,
@@ -1771,6 +1781,9 @@ static u32 abs_s32(s32 x)
return x >= 0 ? (u32)x : -(u32)x;
}
+static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5,
+ const struct bpf_insn *insn);
+
/**
* ___bpf_prog_run - run eBPF program on a given context
* @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
@@ -2077,10 +2090,9 @@ select_insn:
CONT;
JMP_CALL_ARGS:
- BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2,
- BPF_R3, BPF_R4,
- BPF_R5,
- insn + insn->off + 1);
+ BPF_R0 = interpreters_args[insn->off](BPF_R1, BPF_R2, BPF_R3,
+ BPF_R4, BPF_R5,
+ insn + insn->imm + 1);
CONT;
JMP_TAIL_CALL: {
@@ -2394,13 +2406,22 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
#undef PROG_NAME_LIST
#ifdef CONFIG_BPF_SYSCALL
-void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
+int bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
{
stack_depth = max_t(u32, stack_depth, 1);
- insn->off = (s16) insn->imm;
- insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] -
- __bpf_call_base_args;
+ /* Prevent out-of-bounds read to interpreters_args */
+ if (stack_depth > MAX_BPF_STACK)
+ return -EINVAL;
+ insn->off = (round_up(stack_depth, 32) / 32) - 1;
insn->code = BPF_JMP | BPF_CALL_ARGS;
+ return 0;
+}
+
+/*
+ * Convert an index into interpreters_args[] to a call immediate using
+ * BPF_CALL_IMM(). An index outside the table triggers a one-time warning
+ * and yields 0 instead of reading out of bounds.
+ */
+s32 bpf_call_args_imm(s16 idx)
+{
+ if (WARN_ON_ONCE(idx < 0 || idx >= ARRAY_SIZE(interpreters_args)))
+ return 0;
+ return BPF_CALL_IMM(interpreters_args[idx]);
+}
#endif
#endif
@@ -2460,7 +2481,7 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map,
cookie = aux->cgroup_storage[i] ?
aux->cgroup_storage[i]->cookie : 0;
ret = map->owner->storage_cookie[i] == cookie ||
- !cookie;
+ (!cookie && !aux->tail_call_reachable);
}
if (ret &&
map->owner->attach_func_proto != aux->attach_func_proto) {
@@ -3217,6 +3238,11 @@ bool __weak bpf_jit_supports_kfunc_call(void)
return false;
}
+/* Weak default: reports no stack-argument support unless a non-weak definition replaces it. */
+bool __weak bpf_jit_supports_stack_args(void)
+{
+ return false;
+}
+
bool __weak bpf_jit_supports_far_kfunc_call(void)
{
return false;
@@ -3352,6 +3378,12 @@ __weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
}
#ifdef CONFIG_BPF_SYSCALL
+/*
+ * Weak default: always returns false and ignores its arguments. A non-weak
+ * definition elsewhere replaces it.
+ */
+__weak bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write,
+ unsigned long fault_ip)
+{
+ return false;
+}
+
static int __init bpf_global_ma_init(void)
{
int ret;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index cc0a43ebab6b..dc7b859e8bbf 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -581,6 +581,10 @@ static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj,
{
struct xdp_frame *nxdpf;
+ /* Frags live outside the linear frame and cannot be cloned safely. */
+ if (unlikely(xdp_frame_has_frags(xdpf)))
+ return -EOPNOTSUPP;
+
nxdpf = xdpf_clone(xdpf);
if (!nxdpf)
return -ENOMEM;
@@ -706,6 +710,18 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
if (unlikely(err))
return err;
+ if (dst->xdp_prog && skb_cloned(skb)) {
+ struct sk_buff *nskb;
+
+ nskb = skb_copy(skb, GFP_ATOMIC);
+ if (!nskb)
+ return -ENOMEM;
+
+ nskb->mac_len = skb->mac_len;
+ consume_skb(skb);
+ skb = nskb;
+ }
+
/* Redirect has already succeeded semantically at this point, so we just
* return 0 even if packet is dropped. Helper below takes care of
* freeing skb.
@@ -726,6 +742,9 @@ static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
struct sk_buff *nskb;
int err;
+ if (unlikely(skb_is_nonlinear(skb)))
+ return -EOPNOTSUPP;
+
nskb = skb_clone(skb, GFP_ATOMIC);
if (!nskb)
return -ENOMEM;
diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c
index fba9e8c00878..3cf2cc6e3ab6 100644
--- a/kernel/bpf/fixups.c
+++ b/kernel/bpf/fixups.c
@@ -870,7 +870,7 @@ int bpf_convert_ctx_accesses(struct bpf_verifier_env *env)
case PTR_TO_BTF_ID:
case PTR_TO_BTF_ID | PTR_UNTRUSTED:
/* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike
- * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot
+ * PTR_TO_BTF_ID, and an active referenced id, but the same cannot
* be said once it is marked PTR_UNTRUSTED, hence we must handle
* any faults for loads into such types. BPF_WRITE is disallowed
* for this case.
@@ -1250,9 +1250,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
}
if (!bpf_pseudo_call(insn))
continue;
- insn->off = env->insn_aux_data[i].call_imm;
- subprog = bpf_find_subprog(env, i + insn->off + 1);
- insn->imm = subprog;
+ insn->imm = env->insn_aux_data[i].call_imm;
+ subprog = bpf_find_subprog(env, i + insn->imm + 1);
+ insn->off = subprog;
}
prog->jited = 1;
@@ -1265,6 +1265,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
prog->aux->real_func_cnt = env->subprog_cnt;
prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func;
prog->aux->exception_boundary = func[0]->aux->exception_boundary;
+ prog->aux->stack_arg_sp_adjust = func[0]->aux->stack_arg_sp_adjust;
bpf_prog_jit_attempt_done(prog);
return 0;
out_free:
@@ -1378,9 +1379,21 @@ int bpf_fixup_call_args(struct bpf_verifier_env *env)
struct bpf_prog *prog = env->prog;
struct bpf_insn *insn = prog->insnsi;
bool has_kfunc_call = bpf_prog_has_kfunc_call(prog);
- int i, depth;
+ int depth;
#endif
- int err = 0;
+ int i, err = 0;
+
+ for (i = 0; i < env->subprog_cnt; i++) {
+ struct bpf_subprog_info *subprog = &env->subprog_info[i];
+ u16 outgoing = subprog->stack_arg_cnt - bpf_in_stack_arg_cnt(subprog);
+
+ if (subprog->max_out_stack_arg_cnt > outgoing) {
+ verbose(env,
+ "func#%d writes %u stack arg slots, but calls only require %u\n",
+ i, subprog->max_out_stack_arg_cnt, outgoing);
+ return -EINVAL;
+ }
+ }
if (env->prog->jit_requested &&
!bpf_prog_is_offloaded(env->prog->aux)) {
@@ -1395,6 +1408,12 @@ int bpf_fixup_call_args(struct bpf_verifier_env *env)
verbose(env, "calling kernel functions are not allowed in non-JITed programs\n");
return -EINVAL;
}
+ for (i = 0; i < env->subprog_cnt; i++) {
+ if (bpf_in_stack_arg_cnt(&env->subprog_info[i])) {
+ verbose(env, "stack args are not supported in non-JITed programs\n");
+ return -EINVAL;
+ }
+ }
if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) {
/* When JIT fails the progs with bpf2bpf calls and tail_calls
* have to be rejected, since interpreter doesn't support them yet.
@@ -1416,7 +1435,12 @@ int bpf_fixup_call_args(struct bpf_verifier_env *env)
depth = get_callee_stack_depth(env, insn, i);
if (depth < 0)
return depth;
- bpf_patch_call_args(insn, depth);
+ err = bpf_patch_call_args(insn, depth);
+ if (err) {
+ verbose(env, "stack depth %d exceeds interpreter stack depth limit\n",
+ depth);
+ return err;
+ }
}
err = 0;
#endif
@@ -2162,6 +2186,8 @@ patch_map_ops_generic:
insn->imm == BPF_FUNC_get_func_ret) {
if (eatype == BPF_TRACE_FEXIT ||
eatype == BPF_TRACE_FSESSION ||
+ eatype == BPF_TRACE_FEXIT_MULTI ||
+ eatype == BPF_TRACE_FSESSION_MULTI ||
eatype == BPF_MODIFY_RETURN) {
/* Load nr_args from ctx - 8 */
insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3dd9b4924ae4..9f394e1aa2e8 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -9,6 +9,7 @@
#include <linux/rculist_nulls.h>
#include <linux/rcupdate_wait.h>
#include <linux/random.h>
+#include <linux/rhashtable.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/btf_ids.h>
@@ -242,6 +243,10 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab)
if (IS_ERR_OR_NULL(htab->map.record))
return;
+ /*
+ * Preallocated maps do not have a bpf_mem_alloc destructor, so fully
+ * destroy every element, including the extra elements.
+ */
if (htab_has_extra_elems(htab))
num_entries += num_possible_cpus();
for (i = 0; i < num_entries; i++) {
@@ -496,28 +501,26 @@ static void htab_dtor_ctx_free(void *ctx)
kfree(ctx);
}
-static int htab_set_dtor(struct bpf_htab *htab, void (*dtor)(void *, void *))
+static int bpf_ma_set_dtor(struct bpf_map *map, struct bpf_mem_alloc *ma,
+ void (*dtor)(void *, void *))
{
- u32 key_size = htab->map.key_size;
- struct bpf_mem_alloc *ma;
struct htab_btf_record *hrec;
int err;
/* No need for dtors. */
- if (IS_ERR_OR_NULL(htab->map.record))
+ if (IS_ERR_OR_NULL(map->record))
return 0;
hrec = kzalloc(sizeof(*hrec), GFP_KERNEL);
if (!hrec)
return -ENOMEM;
- hrec->key_size = key_size;
- hrec->record = btf_record_dup(htab->map.record);
+ hrec->key_size = map->key_size;
+ hrec->record = btf_record_dup(map->record);
if (IS_ERR(hrec->record)) {
err = PTR_ERR(hrec->record);
kfree(hrec);
return err;
}
- ma = htab_is_percpu(htab) ? &htab->pcpu_ma : &htab->ma;
bpf_mem_alloc_set_dtor(ma, dtor, htab_dtor_ctx_free, hrec);
return 0;
}
@@ -534,9 +537,9 @@ static int htab_map_check_btf(struct bpf_map *map, const struct btf *btf,
* populated in htab_map_alloc(), so it will always appear as NULL.
*/
if (htab_is_percpu(htab))
- return htab_set_dtor(htab, htab_pcpu_mem_dtor);
+ return bpf_ma_set_dtor(map, &htab->pcpu_ma, htab_pcpu_mem_dtor);
else
- return htab_set_dtor(htab, htab_mem_dtor);
+ return bpf_ma_set_dtor(map, &htab->ma, htab_mem_dtor);
}
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
@@ -834,8 +837,8 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,
return insn - insn_buf;
}
-static void check_and_free_fields(struct bpf_htab *htab,
- struct htab_elem *elem)
+static void check_and_cancel_fields(struct bpf_htab *htab,
+ struct htab_elem *elem)
{
if (IS_ERR_OR_NULL(htab->map.record))
return;
@@ -845,11 +848,11 @@ static void check_and_free_fields(struct bpf_htab *htab,
int cpu;
for_each_possible_cpu(cpu)
- bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
+ bpf_obj_cancel_fields(&htab->map, per_cpu_ptr(pptr, cpu));
} else {
void *map_value = htab_elem_value(elem, htab->map.key_size);
- bpf_obj_free_fields(htab->map.record, map_value);
+ bpf_obj_cancel_fields(&htab->map, map_value);
}
}
@@ -884,7 +887,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
htab_unlock_bucket(b, flags);
if (l == tgt_l)
- check_and_free_fields(htab, l);
+ check_and_cancel_fields(htab, l);
return l == tgt_l;
}
@@ -949,7 +952,7 @@ find_first_elem:
static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
- check_and_free_fields(htab, l);
+ check_and_cancel_fields(htab, l);
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
@@ -1002,7 +1005,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
if (htab_is_prealloc(htab)) {
bpf_map_dec_elem_count(&htab->map);
- check_and_free_fields(htab, l);
+ check_and_cancel_fields(htab, l);
pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
dec_elem_count(htab);
@@ -1019,7 +1022,7 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
/* copy true value_size bytes */
ptr = this_cpu_ptr(pptr);
copy_map_value(&htab->map, ptr, value);
- bpf_obj_free_fields(htab->map.record, ptr);
+ bpf_obj_cancel_fields(&htab->map, ptr);
} else {
u32 size = round_up(htab->map.value_size, 8);
void *val;
@@ -1029,7 +1032,7 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
cpu = map_flags >> 32;
ptr = per_cpu_ptr(pptr, cpu);
copy_map_value(&htab->map, ptr, value);
- bpf_obj_free_fields(htab->map.record, ptr);
+ bpf_obj_cancel_fields(&htab->map, ptr);
return;
}
@@ -1037,7 +1040,7 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
ptr = per_cpu_ptr(pptr, cpu);
val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu;
copy_map_value(&htab->map, ptr, val);
- bpf_obj_free_fields(htab->map.record, ptr);
+ bpf_obj_cancel_fields(&htab->map, ptr);
}
}
}
@@ -1253,11 +1256,11 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
if (l_old) {
hlist_nulls_del_rcu(&l_old->hash_node);
- /* l_old has already been stashed in htab->extra_elems, free
- * its special fields before it is available for reuse.
+ /* l_old has already been stashed in htab->extra_elems, cancel
+ * its reusable special fields before it is available for reuse.
*/
if (htab_is_prealloc(htab))
- check_and_free_fields(htab, l_old);
+ check_and_cancel_fields(htab, l_old);
}
htab_unlock_bucket(b, flags);
if (l_old && !htab_is_prealloc(htab))
@@ -1270,7 +1273,7 @@ err:
static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
{
- check_and_free_fields(htab, elem);
+ check_and_cancel_fields(htab, elem);
bpf_map_dec_elem_count(&htab->map);
bpf_lru_push_free(&htab->lru, &elem->lru_node);
}
@@ -2739,3 +2742,794 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
BATCH_OPS(htab),
.map_btf_id = &htab_map_btf_ids[0],
};
+
+/*
+ * One entry of the rhashtable-backed map: the rhashtable linkage followed by
+ * the key and, at the next 8-byte boundary after the key, the value
+ * (see rhtab_elem_value()).
+ */
+struct rhtab_elem {
+ struct rhash_head node;
+ /* key bytes, then value bytes follow */
+ u8 data[] __aligned(8);
+};
+
+/*
+ * Map instance built on rhashtable.
+ * @map: embedded generic map; container_of() recovers the bpf_rhtab from it
+ * @ht: the table holding struct rhtab_elem entries
+ * @ma: allocator used for the elements
+ * @elem_size: bytes per element (header plus 8-byte-rounded key and value)
+ * @freeing_internal: set while rhtab_map_free_internal_structs() walks the
+ * table; new insertions are rejected with -EBUSY meanwhile
+ */
+struct bpf_rhtab {
+ struct bpf_map map;
+ struct rhashtable ht;
+ struct bpf_mem_alloc ma;
+ u32 elem_size;
+ bool freeing_internal;
+};
+
+/*
+ * Base rhashtable parameters: where the linkage and the key live inside
+ * struct rhtab_elem. rhtab_map_alloc() copies this and fills in the per-map
+ * settings; it is also passed to the rhashtable calls for keys that are not
+ * sizeof(long).
+ */
+static const struct rhashtable_params rhtab_params = {
+ .head_offset = offsetof(struct rhtab_elem, node),
+ .key_offset = offsetof(struct rhtab_elem, data),
+};
+
+/* Locate the value of @l: it starts after the key, rounded up to 8 bytes. */
+static inline void *rhtab_elem_value(struct rhtab_elem *l, u32 key_size)
+{
+	u32 value_off = round_up(key_size, 8);
+
+	return &l->data[value_off];
+}
+
+/* Specialize hash function and objcmp for long sized key */
+
+/*
+ * Compare the lookup key in @arg with the key stored in element @ptr, both
+ * read as one unsigned long. Returns 0 on a match, non-zero otherwise.
+ */
+static __always_inline int rhtab_key_cmp_long(struct rhashtable_compare_arg *arg,
+					      const void *ptr)
+{
+	const struct rhtab_elem *candidate = ptr;
+	const unsigned long wanted = *(const unsigned long *)arg->key;
+	const unsigned long stored = *(const unsigned long *)candidate->data;
+
+	return wanted != stored;
+}
+
+/*
+ * Hash a long sized key: fold the upper 32 bits into the lower 32 bits and
+ * mix in @seed. @len is not used.
+ */
+static __always_inline u32 rhtab_hashfn_long(const void *data, u32 len, u32 seed)
+{
+	const u64 key = *(const unsigned long *)data;
+	const u32 folded = (u32)(key ^ (key >> 32));
+
+	return folded ^ seed;
+}
+
+/*
+ * Parameters passed to the rhashtable calls when the map key is exactly
+ * sizeof(long): same layout as rhtab_params plus the specialized hash and
+ * compare callbacks.
+ */
+static const struct rhashtable_params rhtab_params_long = {
+ .head_offset = offsetof(struct rhtab_elem, node),
+ .key_offset = offsetof(struct rhtab_elem, data),
+ .key_len = sizeof(long),
+ .hashfn = rhtab_hashfn_long,
+ .obj_cmpfn = rhtab_key_cmp_long,
+};
+
+/*
+ * Allocate and initialize a map instance from @attr.
+ *
+ * Returns the embedded struct bpf_map on success, or an ERR_PTR():
+ * -ENOMEM if the map area cannot be allocated, -E2BIG if max_entries exceeds
+ * 2^31, or the error from rhashtable_init() / bpf_mem_alloc_init().
+ */
+static struct bpf_map *rhtab_map_alloc(union bpf_attr *attr)
+{
+	struct rhashtable_params p = rhtab_params;
+	struct bpf_rhtab *rht;
+	u32 key_size;
+	int ret;
+
+	rht = bpf_map_area_alloc(sizeof(*rht), NUMA_NO_NODE);
+	if (!rht)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&rht->map, attr);
+	key_size = rht->map.key_size;
+
+	if (rht->map.max_entries > 1UL << 31) {
+		ret = -E2BIG;
+		goto out_free_area;
+	}
+
+	/* Element = header + key + value, key and value each rounded up to 8 bytes. */
+	rht->elem_size = sizeof(struct rhtab_elem) + round_up(key_size, 8) +
+			 round_up(rht->map.value_size, 8);
+
+	p.key_len = key_size;
+	p.nelem_hint = (u32)attr->map_extra;
+	p.automatic_shrinking = true;
+
+	/* Long sized keys get the specialized hash and compare callbacks. */
+	if (key_size == sizeof(long)) {
+		p.hashfn = rhtab_hashfn_long;
+		p.obj_cmpfn = rhtab_key_cmp_long;
+	}
+
+	ret = rhashtable_init(&rht->ht, &p);
+	if (ret)
+		goto out_free_area;
+
+	/* Set max_elems after rhashtable_init() since init zeroes the struct */
+	rht->ht.max_elems = rht->map.max_entries;
+
+	ret = bpf_mem_alloc_init(&rht->ma, rht->elem_size, false);
+	if (ret)
+		goto out_destroy_ht;
+
+	return &rht->map;
+
+out_destroy_ht:
+	rhashtable_destroy(&rht->ht);
+out_free_area:
+	bpf_map_area_free(rht);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Validate creation attributes before allocation.
+ *
+ * BPF_F_NO_PREALLOC is mandatory and BPF_F_ZERO_SEED is rejected (-EINVAL).
+ * key_size above U16_MAX gives -E2BIG. map_extra must fit in 32 bits
+ * (-EINVAL), must not exceed U16_MAX (-E2BIG) and must not exceed max_entries
+ * (-EINVAL). The remaining checks are delegated to htab_map_alloc_check().
+ */
+static int rhtab_map_alloc_check(union bpf_attr *attr)
+{
+	const u32 hint = (u32)attr->map_extra;
+
+	if (!(attr->map_flags & BPF_F_NO_PREALLOC) ||
+	    (attr->map_flags & BPF_F_ZERO_SEED))
+		return -EINVAL;
+
+	if (attr->key_size > U16_MAX)
+		return -E2BIG;
+
+	if (attr->map_extra >> 32)
+		return -EINVAL;
+
+	if (hint > U16_MAX)
+		return -E2BIG;
+
+	if (hint > attr->max_entries)
+		return -EINVAL;
+
+	return htab_map_alloc_check(attr);
+}
+
+/* Free the special fields in @elem's value, if the map has a BTF record. */
+static void rhtab_check_and_free_fields(struct bpf_rhtab *rhtab,
+					struct rhtab_elem *elem)
+{
+	void *value;
+
+	if (IS_ERR_OR_NULL(rhtab->map.record))
+		return;
+
+	value = rhtab_elem_value(elem, rhtab->map.key_size);
+	bpf_obj_free_fields(rhtab->map.record, value);
+}
+
+/*
+ * Allocator destructor callback (registered in rhtab_map_check_btf()): frees
+ * the special fields of element @obj using the record and key size carried
+ * in @ctx.
+ */
+static void rhtab_mem_dtor(void *obj, void *ctx)
+{
+	struct rhtab_elem *dying = obj;
+	struct htab_btf_record *rec = ctx;
+	void *value;
+
+	if (IS_ERR_OR_NULL(rec->record))
+		return;
+
+	value = rhtab_elem_value(dying, rec->key_size);
+	bpf_obj_free_fields(rec->record, value);
+}
+
+/*
+ * Per-element callback for rhashtable_free_and_destroy(): releases the
+ * internal structs of the value, then hands the element back to the
+ * allocator. @arg is the owning bpf_rhtab.
+ */
+static void rhtab_free_elem(void *ptr, void *arg)
+{
+	struct rhtab_elem *dying = ptr;
+	struct bpf_rhtab *owner = arg;
+	void *value = rhtab_elem_value(dying, owner->map.key_size);
+
+	bpf_map_free_internal_structs(&owner->map, value);
+	bpf_mem_cache_free_rcu(&owner->ma, dying);
+}
+
+/*
+ * Tear the map down: free every element and the table, then the element
+ * allocator, then the map area itself.
+ */
+static void rhtab_map_free(struct bpf_map *map)
+{
+	struct bpf_rhtab *rht = container_of(map, struct bpf_rhtab, map);
+
+	/* rhtab_free_elem() runs for each element still in the table. */
+	rhashtable_free_and_destroy(&rht->ht, rhtab_free_elem, rht);
+
+	bpf_mem_alloc_destroy(&rht->ma);
+
+	bpf_map_area_free(rht);
+}
+
+/*
+ * Find the element for @key. Returns the struct rhtab_elem (not the value)
+ * or NULL. Long sized keys go through the specialized parameters.
+ */
+static void *rhtab_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_rhtab *rht = container_of(map, struct bpf_rhtab, map);
+
+	/* Hold RCU lock in case sleepable program calls via gen_lookup */
+	guard(rcu)();
+
+	return map->key_size == sizeof(long) ?
+		rhashtable_lookup_likely(&rht->ht, key, rhtab_params_long) :
+		rhashtable_lookup_likely(&rht->ht, key, rhtab_params);
+}
+
+/* Map op: return a pointer to the value stored under @key, or NULL if absent. */
+static void *rhtab_map_lookup_elem(struct bpf_map *map, void *key) __must_hold(RCU)
+{
+	struct rhtab_elem *found = rhtab_lookup_elem(map, key);
+
+	if (!found)
+		return NULL;
+
+	return rhtab_elem_value(found, map->key_size);
+}
+
+/*
+ * Copy @elem's value into @dst. With BPF_F_LOCK in @flags the locked copy
+ * helper is used, otherwise a plain map-value copy.
+ */
+static void rhtab_read_elem_value(struct bpf_map *map, void *dst, struct rhtab_elem *elem,
+				  u64 flags)
+{
+	void *value = rhtab_elem_value(elem, map->key_size);
+
+	if (!(flags & BPF_F_LOCK)) {
+		copy_map_value(map, dst, value);
+		return;
+	}
+
+	copy_map_value_locked(map, dst, value, true);
+}
+
+/*
+ * Unlink @elem from the table and release it.
+ *
+ * When @copy is non-NULL the value is first copied out (honouring @flags) and
+ * its special fields are re-initialized in the copy. Returns 0 on success or
+ * the error from rhashtable_remove_fast(), in which case nothing is freed.
+ */
+static int rhtab_delete_elem(struct bpf_rhtab *rhtab, struct rhtab_elem *elem, void *copy,
+			     u64 flags)
+{
+	const bool long_key = rhtab->map.key_size == sizeof(long);
+	int ret;
+
+	/*
+	 * rhashtable takes its bucket lock with local_irq_save(), so only a
+	 * program running in NMI context could re-enter rhashtable code and
+	 * deadlock. bpf_disable_instrumentation() keeps such programs from
+	 * running, except raw tracepoints, which rhashtable does not have.
+	 */
+	bpf_disable_instrumentation();
+	ret = long_key ?
+		rhashtable_remove_fast(&rhtab->ht, &elem->node, rhtab_params_long) :
+		rhashtable_remove_fast(&rhtab->ht, &elem->node, rhtab_params);
+	bpf_enable_instrumentation();
+
+	if (ret)
+		return ret;
+
+	if (copy) {
+		rhtab_read_elem_value(&rhtab->map, copy, elem, flags);
+		check_and_init_map_value(&rhtab->map, copy);
+	}
+
+	/* Release internal structs: kptr, bpf_timer, task_work, wq */
+	rhtab_check_and_free_fields(rhtab, elem);
+	bpf_mem_cache_free_rcu(&rhtab->ma, elem);
+
+	return 0;
+}
+
+
+/* Map op: delete the element stored under @key; -ENOENT when there is none. */
+static long rhtab_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_rhtab *rht = container_of(map, struct bpf_rhtab, map);
+	struct rhtab_elem *victim;
+
+	guard(rcu)();
+
+	victim = rhtab_lookup_elem(map, key);
+
+	return victim ? rhtab_delete_elem(rht, victim, NULL, 0) : -ENOENT;
+}
+
+/*
+ * Map op: copy the value stored under @key into @value and delete the
+ * element. Only BPF_F_LOCK is accepted in @flags. Returns -ENOENT when the
+ * key is absent, or the error from the flag check or the removal.
+ */
+static int rhtab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, void *value, u64 flags)
+{
+	struct bpf_rhtab *rht = container_of(map, struct bpf_rhtab, map);
+	struct rhtab_elem *victim;
+	int ret;
+
+	ret = bpf_map_check_op_flags(map, flags, BPF_F_LOCK);
+	if (ret)
+		return ret;
+
+	guard(rcu)();
+
+	victim = rhtab_lookup_elem(map, key);
+
+	return victim ? rhtab_delete_elem(rht, victim, value, flags) : -ENOENT;
+}
+
+/*
+ * Overwrite the value of an element that is already in the table.
+ * Returns -EEXIST when the caller asked for BPF_NOEXIST, 0 otherwise.
+ */
+static long rhtab_map_update_existing(struct bpf_map *map, struct rhtab_elem *elem, void *value,
+				      u64 map_flags)
+{
+	struct bpf_rhtab *rht;
+	void *slot;
+
+	if (map_flags & BPF_NOEXIST)
+		return -EEXIST;
+
+	rht = container_of(map, struct bpf_rhtab, map);
+	slot = rhtab_elem_value(elem, map->key_size);
+
+	if (!(map_flags & BPF_F_LOCK))
+		copy_map_value(map, slot, value);
+	else
+		copy_map_value_locked(map, slot, value, false);
+
+	/*
+	 * Torn reads: a reader that does not use BPF_F_LOCK may see the value
+	 * in the middle of the copy. Callers that need consistent reads must
+	 * use BPF_F_LOCK, matching arraymap semantics.
+	 *
+	 * copy_map_value() skips the special-field offsets, so the old
+	 * timers/kptrs/etc. are still in the slot. Cancel them after the
+	 * copy to match arraymap's update semantics.
+	 */
+	rhtab_check_and_free_fields(rht, elem);
+
+	return 0;
+}
+
+/*
+ * Map op: insert or update the element stored under @key.
+ *
+ * Returns 0 on success; -EINVAL for unsupported flags or BPF_F_LOCK without a
+ * spin lock field; -ENOENT for BPF_EXIST on a missing key; -EEXIST for
+ * BPF_NOEXIST on a present key; -EBUSY while internal structs are being
+ * freed; -E2BIG when the map is full; -ENOMEM when no element can be
+ * allocated; or the error reported by the rhashtable insert.
+ */
+static long rhtab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags)
+{
+	struct bpf_rhtab *rht = container_of(map, struct bpf_rhtab, map);
+	struct rhtab_elem *fresh, *found;
+	void *fresh_val;
+
+	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
+		return -EINVAL;
+
+	if ((map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))
+		return -EINVAL;
+
+	guard(rcu)();
+
+	found = rhtab_lookup_elem(map, key);
+	if (found)
+		return rhtab_map_update_existing(map, found, value, map_flags);
+
+	if (map_flags & BPF_EXIST)
+		return -ENOENT;
+
+	/*
+	 * No new insertions while the map_release_uref cleanup walks the
+	 * table: new elements could keep triggering rehash and prevent the
+	 * walk from terminating.
+	 */
+	if (READ_ONCE(rht->freeing_internal))
+		return -EBUSY;
+
+	/* Check max_entries limit before inserting new element */
+	if (atomic_read(&rht->ht.nelems) >= map->max_entries)
+		return -E2BIG;
+
+	fresh = bpf_mem_cache_alloc(&rht->ma);
+	if (!fresh)
+		return -ENOMEM;
+
+	fresh_val = rhtab_elem_value(fresh, map->key_size);
+	memcpy(fresh->data, key, map->key_size);
+	copy_map_value(map, fresh_val, value);
+	check_and_init_map_value(map, fresh_val);
+
+	/* Prevent deadlock for NMI programs attempting to take bucket lock */
+	bpf_disable_instrumentation();
+
+	if (map->key_size == sizeof(long))
+		found = rhashtable_lookup_get_insert_fast(&rht->ht, &fresh->node,
+							  rhtab_params_long);
+	else
+		found = rhashtable_lookup_get_insert_fast(&rht->ht, &fresh->node,
+							  rhtab_params);
+
+	bpf_enable_instrumentation();
+
+	/* NULL means the new element went in. */
+	if (!found)
+		return 0;
+
+	/* Not inserted: drop the new element, then report the error or update the existing one. */
+	bpf_mem_cache_free(&rht->ma, fresh);
+	if (IS_ERR(found))
+		return PTR_ERR(found);
+
+	return rhtab_map_update_existing(map, found, value, map_flags);
+}
+
+/*
+ * Emit the inline lookup sequence into @insn_buf: call rhtab_lookup_elem(),
+ * skip the next instruction when R0 is 0, otherwise advance R0 from the
+ * element to its value. Returns the number of instructions written.
+ */
+static int rhtab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+	int cnt = 0;
+
+	BUILD_BUG_ON(!__same_type(&rhtab_lookup_elem,
+		     (void *(*)(struct bpf_map *map, void *key)) NULL));
+
+	insn_buf[cnt++] = BPF_EMIT_CALL(rhtab_lookup_elem);
+	insn_buf[cnt++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1);
+	insn_buf[cnt++] = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0,
+				offsetof(struct rhtab_elem, data) + round_up(map->key_size, 8));
+
+	return cnt;
+}
+
+/*
+ * Map op: register rhtab_mem_dtor() as the destructor of the element
+ * allocator. @btf, @key_type and @value_type are not used here.
+ */
+static int rhtab_map_check_btf(struct bpf_map *map, const struct btf *btf,
+			       const struct btf_type *key_type,
+			       const struct btf_type *value_type)
+{
+	struct bpf_mem_alloc *elem_ma = &container_of(map, struct bpf_rhtab, map)->ma;
+
+	return bpf_ma_set_dtor(map, elem_ma, rhtab_mem_dtor);
+}
+
+/*
+ * Map op (map_release_uref): walk every element and release the internal
+ * structs in its value. Does nothing when the map has no internal structs.
+ * New insertions are blocked through rhtab->freeing_internal for the duration
+ * of the walk.
+ */
+static void rhtab_map_free_internal_structs(struct bpf_map *map)
+{
+ struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+ struct rhashtable_iter iter;
+ struct rhtab_elem *elem;
+
+ if (!bpf_map_has_internal_structs(map))
+ return;
+
+ /*
+ * Block new insertions. Once observed, no new growth is triggered,
+ * so any in-flight rehash will drain and the walker is guaranteed
+ * to stop returning -EAGAIN. Treat -EAGAIN as "rehash in progress,
+ * retry"; do not wait for the worker.
+ */
+ WRITE_ONCE(rhtab->freeing_internal, true);
+
+ rhashtable_walk_enter(&rhtab->ht, &iter);
+ rhashtable_walk_start(&iter);
+
+ while ((elem = rhashtable_walk_next(&iter))) {
+ if (IS_ERR(elem)) {
+ /* -EAGAIN: keep walking; any other error ends the walk. */
+ if (PTR_ERR(elem) == -EAGAIN)
+ continue;
+ break;
+ }
+
+ bpf_map_free_internal_structs(map, rhtab_elem_value(elem, map->key_size));
+
+ if (need_resched()) { /* Avoid stalls on large maps */
+ rhashtable_walk_stop(&iter);
+ cond_resched();
+ rhashtable_walk_start(&iter);
+ }
+ }
+
+ rhashtable_walk_stop(&iter);
+ rhashtable_walk_exit(&iter);
+ /* Walk finished: allow insertions again. */
+ WRITE_ONCE(rhtab->freeing_internal, false);
+}
+
+/*
+ * Map op: copy the key that follows @key into @next_key.
+ *
+ * If @key is reported as not found, iteration restarts from the first key.
+ * Returns 0 on success, -ENOENT when there is no further element, or the
+ * error carried by rhashtable_next_key().
+ */
+static int rhtab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+	__must_hold_shared(RCU)
+{
+	struct bpf_rhtab *rht = container_of(map, struct bpf_rhtab, map);
+	struct rhtab_elem *succ = rhashtable_next_key(&rht->ht, key);
+
+	/* if not found, return the first key */
+	if (PTR_ERR(succ) == -ENOENT)
+		succ = rhashtable_next_key(&rht->ht, NULL);
+
+	if (IS_ERR_OR_NULL(succ))
+		return succ ? PTR_ERR(succ) : -ENOENT;
+
+	memcpy(next_key, succ->data, map->key_size);
+	return 0;
+}
+
+/*
+ * Map op: print one "key: value" line for @key into @m using the map's BTF
+ * type ids. Prints nothing when the key is not in the map.
+ */
+static void rhtab_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m)
+{
+	void *val;
+
+	/* Guarantee that hashtab value is not freed */
+	guard(rcu)();
+
+	val = rhtab_map_lookup_elem(map, key);
+	if (val) {
+		btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
+		seq_puts(m, ": ");
+		btf_type_seq_show(map->btf, map->btf_value_type_id, val, m);
+		seq_putc(m, '\n');
+	}
+}
+
+/*
+ * Map op (map_for_each_callback): invoke @callback_fn on each element until
+ * it returns non-zero or iteration ends.
+ *
+ * Returns the number of elements the callback was invoked on, or -EINVAL
+ * when @flags is non-zero.
+ */
+static long bpf_each_rhash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+				void *callback_ctx, u64 flags)
+{
+	struct bpf_rhtab *rht = container_of(map, struct bpf_rhtab, map);
+	struct rhtab_elem *cur;
+	int visited = 0;
+
+	cant_migrate();
+
+	if (flags != 0)
+		return -EINVAL;
+
+	rcu_read_lock();
+	/*
+	 * Best-effort iteration: a concurrent resize, delete or insert may
+	 * cause elements to be missed or visited twice.
+	 *
+	 * The previous element's key is the cursor for the next step; it
+	 * stays valid while RCU is held.
+	 */
+	for (cur = rhashtable_next_key(&rht->ht, NULL);
+	     !IS_ERR_OR_NULL(cur);
+	     cur = rhashtable_next_key(&rht->ht, cur->data)) {
+		visited++;
+		if (callback_fn((u64)(long)map,
+				(u64)(long)cur->data,
+				(u64)(long)rhtab_elem_value(cur, map->key_size),
+				(u64)(long)callback_ctx, 0))
+			break;
+	}
+	rcu_read_unlock();
+
+	return visited;
+}
+
+/*
+ * Map op: estimate memory use as the map struct plus one elem_size per
+ * element currently in the table.
+ */
+static u64 rhtab_map_mem_usage(const struct bpf_map *map)
+{
+	struct bpf_rhtab *rht = container_of(map, struct bpf_rhtab, map);
+	/* Excludes rhashtable bucket overhead (~ nelems * sizeof(void *) at 75% load). */
+	u64 live = atomic_read(&rht->ht.nelems);
+
+	return sizeof(struct bpf_rhtab) + rht->elem_size * live;
+}
+
+/*
+ * Common implementation of the batch lookup and batch lookup-and-delete ops.
+ *
+ * Copies up to attr->batch.count key/value pairs into kernel buffers under
+ * RCU, optionally deletes the copied elements (@do_delete), then copies the
+ * results and the number of pairs to user space. attr->batch.in_batch, when
+ * set, holds the key to resume from; attr->batch.out_batch receives the key
+ * to resume from next time, if there is one.
+ *
+ * Returns:
+ * 0 - pairs were returned and more elements remain (also returned
+ * right away when batch.count is 0)
+ * -ENOENT - no more elements; batch.count still reports any pairs returned
+ * -EAGAIN - the resume key from in_batch is no longer in the table
+ * -EINVAL - attr->batch.flags is non-zero
+ * -ENOMEM - a temporary buffer could not be allocated
+ * -EFAULT - a user-space copy failed
+ * or the error from bpf_map_check_op_flags() for batch.elem_flags.
+ */
+static int __rhtab_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ bool do_delete)
+{
+ struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+ void __user *uvalues = u64_to_user_ptr(attr->batch.values);
+ void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
+ void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
+ void *cursor = NULL, *keys = NULL, *values = NULL, *dst_key, *dst_val;
+ struct rhtab_elem **del_elems = NULL;
+ u32 max_count, total, key_size, value_size, i;
+ bool has_next_cursor = false;
+ struct rhtab_elem *elem;
+ u64 elem_map_flags, map_flags;
+ int ret = 0;
+
+ elem_map_flags = attr->batch.elem_flags;
+ ret = bpf_map_check_op_flags(map, elem_map_flags, BPF_F_LOCK);
+ if (ret)
+ return ret;
+
+ map_flags = attr->batch.flags;
+ if (map_flags)
+ return -EINVAL;
+
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ /* Report zero pairs up front so every early error path leaves count at 0. */
+ if (put_user(0, &uattr->batch.count))
+ return -EFAULT;
+
+ key_size = map->key_size;
+ value_size = map->value_size;
+
+ /* All four buffers are allocated first and checked together below. */
+ keys = kvmalloc_array(max_count, key_size, GFP_USER | __GFP_NOWARN);
+ values = kvmalloc_array(max_count, value_size, GFP_USER | __GFP_NOWARN);
+ if (do_delete)
+ del_elems = kvmalloc_array(max_count, sizeof(void *),
+ GFP_USER | __GFP_NOWARN);
+ cursor = kmalloc(key_size, GFP_USER | __GFP_NOWARN);
+
+ if (!keys || !values || !cursor || (do_delete && !del_elems)) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ if (ubatch && copy_from_user(cursor, ubatch, key_size)) {
+ ret = -EFAULT;
+ goto free;
+ }
+
+ dst_key = keys;
+ dst_val = values;
+ total = 0;
+
+ rcu_read_lock();
+
+ /*
+ * Cursor stores the key of the next-to-process element (stashed by
+ * the previous batch). Look it up directly so the element is included
+ * here rather than skipped by next_key(). If the cursor was deleted
+ * concurrently (or by the previous do_delete batch), return -EAGAIN
+ * so userspace can distinguish a lost cursor from end-of-iteration
+ * (-ENOENT) and restart from a NULL cursor.
+ */
+ if (ubatch) {
+ elem = rhtab_lookup_elem(map, cursor);
+ if (!elem) {
+ rcu_read_unlock();
+ ret = -EAGAIN;
+ goto free;
+ }
+ } else {
+ elem = rhashtable_next_key(&rhtab->ht, NULL);
+ }
+
+ while (elem && !IS_ERR(elem) && total < max_count) {
+ memcpy(dst_key, elem->data, key_size);
+ rhtab_read_elem_value(map, dst_val, elem, elem_map_flags);
+ check_and_init_map_value(map, dst_val);
+
+ /* Deletion is deferred until the walk is done. */
+ if (do_delete)
+ del_elems[total] = elem;
+
+ /* Advance using the key just copied out. */
+ elem = rhashtable_next_key(&rhtab->ht, dst_key);
+ dst_key += key_size;
+ dst_val += value_size;
+ total++;
+
+ /* Bail to userspace to avoid stalls. */
+ if (need_resched())
+ break;
+ }
+
+ /*
+ * NOTE(review): an ERR_PTR from rhashtable_next_key() ends the loop
+ * without a next cursor and is reported below as -ENOENT. Confirm
+ * that a position lost mid-batch should not be -EAGAIN instead.
+ */
+ if (elem && !IS_ERR(elem)) {
+ /* Stash next-to-process key as cursor for the next batch. */
+ memcpy(cursor, elem->data, key_size);
+ has_next_cursor = true;
+ }
+
+ if (do_delete) {
+ /* The return value is ignored here. */
+ for (i = 0; i < total; i++)
+ rhtab_delete_elem(rhtab, del_elems[i], NULL, 0);
+ }
+
+ rcu_read_unlock();
+
+ if (total == 0) {
+ ret = -ENOENT;
+ goto free;
+ }
+
+ /* No more elements after this batch. */
+ if (!has_next_cursor)
+ ret = -ENOENT;
+
+ if (copy_to_user(ukeys, keys, (size_t)total * key_size) ||
+ copy_to_user(uvalues, values, (size_t)total * value_size) ||
+ put_user(total, &uattr->batch.count) ||
+ (has_next_cursor &&
+ copy_to_user(u64_to_user_ptr(attr->batch.out_batch),
+ cursor, key_size))) {
+ ret = -EFAULT;
+ goto free;
+ }
+
+free:
+ kfree(cursor);
+ kvfree(keys);
+ kvfree(values);
+ kvfree(del_elems);
+ return ret;
+}
+
+/* Batch lookup: the shared batch implementation with deletion disabled. */
+static int rhtab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __rhtab_map_lookup_and_delete_batch(map, attr, uattr, false);
+}
+
+/* Batch lookup-and-delete: the shared batch implementation with deletion enabled. */
+static int rhtab_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __rhtab_map_lookup_and_delete_batch(map, attr, uattr, true);
+}
+
+/*
+ * Private state of one map-element iterator over this map type.
+ * @map: the map being iterated; a reference is taken at init, dropped at fini
+ * @rhtab: the same map as its containing bpf_rhtab
+ * @iter: rhashtable walker, entered at init and exited at fini
+ */
+struct bpf_iter_seq_rhash_map_info {
+ struct bpf_map *map;
+ struct bpf_rhtab *rhtab;
+ struct rhashtable_iter iter;
+};
+
+/*
+ * seq_file start: resume the walker and return the element to show first,
+ * or NULL when the walk is finished or failed.
+ */
+static void *bpf_rhash_map_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	struct bpf_iter_seq_rhash_map_info *info = seq->private;
+	struct rhtab_elem *cur;
+
+	rhashtable_walk_start(&info->iter);
+	/*
+	 * Deliver again the element that walk_next() returned at the end of
+	 * the previous read(): bpf_seq_read may have stopped before show()
+	 * consumed it. Rehash rewinds the walker; retry on -EAGAIN.
+	 */
+	for (;;) {
+		cur = rhashtable_walk_peek(&info->iter);
+		if (PTR_ERR(cur) != -EAGAIN)
+			break;
+	}
+
+	if (IS_ERR(cur))
+		return NULL;
+
+	if (cur && *pos == 0)
+		++*pos;
+
+	return cur;
+}
+
+/*
+ * seq_file next: advance @pos and return the next element, or NULL at the
+ * end of the walk or on an error other than -EAGAIN.
+ */
+static void *bpf_rhash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_iter_seq_rhash_map_info *info = seq->private;
+	struct rhtab_elem *cur;
+
+	++*pos;
+
+	/* Rehash rewinds the walker; retry until it stops returning -EAGAIN. */
+	for (;;) {
+		cur = rhashtable_walk_next(&info->iter);
+		if (PTR_ERR(cur) != -EAGAIN)
+			break;
+	}
+
+	return IS_ERR(cur) ? NULL : cur;
+}
+
+/*
+ * Run the iterator program for @elem. A NULL @elem is passed to
+ * bpf_iter_get_info() as the second argument being true, and the program
+ * context then carries no key or value. Returns the program's result, or 0
+ * when there is no program to run.
+ */
+static int __bpf_rhash_map_seq_show(struct seq_file *seq,
+				    struct rhtab_elem *elem)
+{
+	struct bpf_iter_seq_rhash_map_info *info = seq->private;
+	struct bpf_iter__bpf_map_elem ctx = {};
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, !elem);
+	if (!prog)
+		return 0;
+
+	ctx.meta = &meta;
+	ctx.map = info->map;
+	if (elem) {
+		ctx.key = elem->data;
+		ctx.value = rhtab_elem_value(elem, info->map->key_size);
+	}
+
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+/* seq_file show: run the iterator program on element @v. */
+static int bpf_rhash_map_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_rhash_map_seq_show(seq, v);
+}
+
+/*
+ * seq_file stop: when called with no current element (@v is NULL), run the
+ * iterator program one more time without an element, ignoring its result;
+ * then pause the walker.
+ */
+static void bpf_rhash_map_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ struct bpf_iter_seq_rhash_map_info *info = seq->private;
+
+ if (!v)
+ (void)__bpf_rhash_map_seq_show(seq, NULL);
+
+ rhashtable_walk_stop(&info->iter);
+}
+
+/*
+ * Iterator init: take a map reference (with uref), record the map in the
+ * private state and enter the rhashtable walker. Always returns 0.
+ */
+static int bpf_iter_init_rhash_map(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+	struct bpf_iter_seq_rhash_map_info *info = priv_data;
+	struct bpf_rhtab *rht = container_of(aux->map, struct bpf_rhtab, map);
+
+	bpf_map_inc_with_uref(aux->map);
+
+	info->map = aux->map;
+	info->rhtab = rht;
+	rhashtable_walk_enter(&rht->ht, &info->iter);
+
+	return 0;
+}
+
+/*
+ * Iterator teardown: exit the walker entered at init, then drop the map
+ * reference taken there.
+ */
+static void bpf_iter_fini_rhash_map(void *priv_data)
+{
+ struct bpf_iter_seq_rhash_map_info *info = priv_data;
+
+ rhashtable_walk_exit(&info->iter);
+ bpf_map_put_with_uref(info->map);
+}
+
+/* seq_file callbacks used by the map-element iterator for this map type. */
+static const struct seq_operations bpf_rhash_map_seq_ops = {
+ .start = bpf_rhash_map_seq_start,
+ .next = bpf_rhash_map_seq_next,
+ .stop = bpf_rhash_map_seq_stop,
+ .show = bpf_rhash_map_seq_show,
+};
+
+/*
+ * Iterator description: the seq_file callbacks, the init/fini hooks for the
+ * private state, and the size of that state.
+ */
+static const struct bpf_iter_seq_info rhash_iter_seq_info = {
+ .seq_ops = &bpf_rhash_map_seq_ops,
+ .init_seq_private = bpf_iter_init_rhash_map,
+ .fini_seq_private = bpf_iter_fini_rhash_map,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_rhash_map_info),
+};
+
+/*
+ * Operations table for the rhashtable-backed map. map_release_uref points at
+ * rhtab_map_free_internal_structs(); the batch entries are supplied by
+ * BATCH_OPS(rhtab).
+ */
+BTF_ID_LIST_SINGLE(rhtab_map_btf_ids, struct, bpf_rhtab)
+const struct bpf_map_ops rhtab_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = rhtab_map_alloc_check,
+ .map_alloc = rhtab_map_alloc,
+ .map_free = rhtab_map_free,
+ .map_get_next_key = rhtab_map_get_next_key,
+ .map_release_uref = rhtab_map_free_internal_structs,
+ .map_check_btf = rhtab_map_check_btf,
+ .map_lookup_elem = rhtab_map_lookup_elem,
+ .map_lookup_and_delete_elem = rhtab_map_lookup_and_delete_elem,
+ .map_update_elem = rhtab_map_update_elem,
+ .map_delete_elem = rhtab_map_delete_elem,
+ .map_gen_lookup = rhtab_map_gen_lookup,
+ .map_seq_show_elem = rhtab_map_seq_show_elem,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_each_rhash_elem,
+ .map_mem_usage = rhtab_map_mem_usage,
+ BATCH_OPS(rhtab),
+ .map_btf_id = &rhtab_map_btf_ids[0],
+ .iter_seq_info = &rhash_iter_seq_info,
+};
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 2bb60200c266..8e196c9b7c50 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1944,7 +1944,7 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
- .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
+ .arg3_type = ARG_PTR_TO_DYNPTR,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -2001,7 +2001,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = {
.func = bpf_dynptr_write,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
+ .arg1_type = ARG_PTR_TO_DYNPTR,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE_OR_ZERO,
@@ -2044,7 +2044,7 @@ static const struct bpf_func_proto bpf_dynptr_data_proto = {
.func = bpf_dynptr_data,
.gpl_only = false,
.ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL,
- .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
+ .arg1_type = ARG_PTR_TO_DYNPTR,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
};
@@ -2247,10 +2247,11 @@ EXPORT_SYMBOL_GPL(bpf_base_func_proto);
void bpf_list_head_free(const struct btf_field *field, void *list_head,
struct bpf_spin_lock *spin_lock)
{
- struct list_head *head = list_head, *orig_head = list_head;
+ struct list_head *head = list_head, drain, *pos, *n;
BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head));
BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head));
+ INIT_LIST_HEAD(&drain);
/* Do the actual list draining outside the lock to not hold the lock for
* too long, and also prevent deadlocks if tracing programs end up
@@ -2261,20 +2262,30 @@ void bpf_list_head_free(const struct btf_field *field, void *list_head,
__bpf_spin_lock_irqsave(spin_lock);
if (!head->next || list_empty(head))
goto unlock;
- head = head->next;
+ list_for_each_safe(pos, n, head) {
+ struct bpf_list_node_kern *node;
+
+ node = container_of(pos, struct bpf_list_node_kern, list_head);
+ WRITE_ONCE(node->owner, BPF_PTR_POISON);
+ list_move_tail(pos, &drain);
+ }
unlock:
- INIT_LIST_HEAD(orig_head);
+ INIT_LIST_HEAD(head);
__bpf_spin_unlock_irqrestore(spin_lock);
- while (head != orig_head) {
- void *obj = head;
+ while (!list_empty(&drain)) {
+ struct bpf_list_node_kern *node;
- obj -= field->graph_root.node_offset;
- head = head->next;
+ pos = drain.next;
+ node = container_of(pos, struct bpf_list_node_kern, list_head);
+ list_del_init(pos);
+ /* Ensure __bpf_list_add() sees the node as unlinked. */
+ smp_store_release(&node->owner, NULL);
/* The contained type can also have resources, including a
* bpf_list_head which needs to be freed.
*/
- __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
+ __bpf_obj_drop_impl((char *)pos - field->graph_root.node_offset,
+ field->graph_root.value_rec, false);
}
}
@@ -2295,6 +2306,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
struct bpf_spin_lock *spin_lock)
{
struct rb_root_cached orig_root, *root = rb_root;
+ struct bpf_rb_node_kern *node;
struct rb_node *pos, *n;
void *obj;
@@ -2303,14 +2315,20 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
__bpf_spin_lock_irqsave(spin_lock);
orig_root = *root;
+ bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) {
+ node = rb_entry(pos, struct bpf_rb_node_kern, rb_node);
+ WRITE_ONCE(node->owner, BPF_PTR_POISON);
+ }
*root = RB_ROOT_CACHED;
__bpf_spin_unlock_irqrestore(spin_lock);
bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) {
obj = pos;
obj -= field->graph_root.node_offset;
-
-
+ node = rb_entry(pos, struct bpf_rb_node_kern, rb_node);
+ RB_CLEAR_NODE(pos);
+ /* Ensure __bpf_rbtree_add() sees the node as unlinked. */
+ smp_store_release(&node->owner, NULL);
__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
}
}
@@ -2467,9 +2485,11 @@ __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta
static int __bpf_list_add(struct bpf_list_node_kern *node,
struct bpf_list_head *head,
- bool tail, struct btf_record *rec, u64 off)
+ struct list_head **prev_ptr,
+ struct btf_record *rec, u64 off)
{
struct list_head *n = &node->list_head, *h = (void *)head;
+ struct list_head *prev;
/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
* called on its fields, so init here
@@ -2477,19 +2497,31 @@ static int __bpf_list_add(struct bpf_list_node_kern *node,
if (unlikely(!h->next))
INIT_LIST_HEAD(h);
+ prev = *prev_ptr;
+
+ /* When prev is not the list head, it must be a node in this list. */
+ if (prev != h) {
+ struct bpf_list_node_kern *prev_kn =
+ container_of(prev, struct bpf_list_node_kern, list_head);
+
+ if (unlikely(READ_ONCE(prev_kn->owner) != head))
+ goto fail;
+ }
+
/* node->owner != NULL implies !list_empty(n), no need to separately
* check the latter
*/
- if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
- /* Only called from BPF prog, no need to migrate_disable */
- __bpf_obj_drop_impl((void *)n - off, rec, false);
- return -EINVAL;
- }
+ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON))
+ goto fail;
- tail ? list_add_tail(n, h) : list_add(n, h);
+ list_add(n, prev);
WRITE_ONCE(node->owner, head);
-
return 0;
+
+fail:
+ /* Only called from BPF prog, no need to migrate_disable */
+ __bpf_obj_drop_impl((void *)n - off, rec, false);
+ return -EINVAL;
}
/**
@@ -2510,8 +2542,9 @@ __bpf_kfunc int bpf_list_push_front(struct bpf_list_head *head,
u64 off)
{
struct bpf_list_node_kern *n = (void *)node;
+ struct list_head *h = (void *)head;
- return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off);
+ return __bpf_list_add(n, head, &h, meta ? meta->record : NULL, off);
}
__bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head,
@@ -2539,8 +2572,9 @@ __bpf_kfunc int bpf_list_push_back(struct bpf_list_head *head,
u64 off)
{
struct bpf_list_node_kern *n = (void *)node;
+ struct list_head *h = (void *)head;
- return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off);
+ return __bpf_list_add(n, head, &h->prev, meta ? meta->record : NULL, off);
}
__bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
@@ -2550,37 +2584,63 @@ __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
return bpf_list_push_back(head, node, meta__ign, off);
}
-static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
+__bpf_kfunc int bpf_list_add(struct bpf_list_head *head, struct bpf_list_node *new,
+ struct bpf_list_node *prev__nonown_allowed,
+ struct btf_struct_meta *meta, u64 off)
+{
+ struct bpf_list_node_kern *n = (void *)new, *p = (void *)prev__nonown_allowed;
+ struct list_head *prev_ptr = &p->list_head;
+
+ return __bpf_list_add(n, head, &prev_ptr, meta ? meta->record : NULL, off);
+}
+
+static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head,
+ struct list_head *n)
{
- struct list_head *n, *h = (void *)head;
+ struct list_head *h = (void *)head;
struct bpf_list_node_kern *node;
/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
* called on its fields, so init here
*/
- if (unlikely(!h->next))
+ if (unlikely(!h->next)) {
INIT_LIST_HEAD(h);
+ return NULL;
+ }
if (list_empty(h))
return NULL;
- n = tail ? h->prev : h->next;
node = container_of(n, struct bpf_list_node_kern, list_head);
- if (WARN_ON_ONCE(READ_ONCE(node->owner) != head))
+ if (unlikely(READ_ONCE(node->owner) != head))
return NULL;
list_del_init(n);
- WRITE_ONCE(node->owner, NULL);
+ /* Ensure __bpf_list_add() sees the node as unlinked. */
+ smp_store_release(&node->owner, NULL);
return (struct bpf_list_node *)n;
}
__bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head)
{
- return __bpf_list_del(head, false);
+ struct list_head *h = (void *)head;
+
+ return __bpf_list_del(head, h->next);
}
__bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
{
- return __bpf_list_del(head, true);
+ struct list_head *h = (void *)head;
+
+ return __bpf_list_del(head, h->prev);
+}
+
+__bpf_kfunc struct bpf_list_node *bpf_list_del(struct bpf_list_head *head,
+ struct bpf_list_node *node__nonown_allowed)
+{
+ struct bpf_list_node_kern *kn = (void *)node__nonown_allowed;
+
+ /* verifier guarantees node is a list node rather than list head */
+ return __bpf_list_del(head, &kn->list_head);
}
__bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head)
@@ -2603,6 +2663,43 @@ __bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head)
return (struct bpf_list_node *)h->prev;
}
+__bpf_kfunc bool bpf_list_is_first(struct bpf_list_head *head,
+ struct bpf_list_node *node__nonown_allowed)
+{
+ struct list_head *h = (struct list_head *)head;
+ struct bpf_list_node_kern *kn = (struct bpf_list_node_kern *)node__nonown_allowed;
+
+ if (READ_ONCE(kn->owner) != head)
+ return false;
+
+ return list_is_first(&kn->list_head, h);
+}
+
+__bpf_kfunc bool bpf_list_is_last(struct bpf_list_head *head,
+ struct bpf_list_node *node__nonown_allowed)
+{
+ struct list_head *h = (struct list_head *)head;
+ struct bpf_list_node_kern *kn = (struct bpf_list_node_kern *)node__nonown_allowed;
+
+ if (READ_ONCE(kn->owner) != head)
+ return false;
+
+ return list_is_last(&kn->list_head, h);
+}
+
+__bpf_kfunc bool bpf_list_empty(struct bpf_list_head *head)
+{
+ struct list_head *h = (struct list_head *)head;
+
+ /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
+ * called on its fields, so init here
+ */
+ if (unlikely(!h->next))
+ INIT_LIST_HEAD(h);
+
+ return list_empty(h);
+}
+
__bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
struct bpf_rb_node *node)
{
@@ -2912,11 +3009,13 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid)
{
struct task_struct *p;
- rcu_read_lock();
+ guard(rcu)();
+ if (!task_active_pid_ns(current))
+ return NULL;
+
p = find_task_by_vpid(vpid);
if (p)
p = bpf_task_acquire(p);
- rcu_read_unlock();
return p;
}
@@ -3072,7 +3171,7 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset,
return bpf_dynptr_slice(p, offset, buffer__nullable, buffer__szk);
}
-__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end)
+__bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr *p, u64 start, u64 end)
{
struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
u64 size;
@@ -3093,14 +3192,14 @@ __bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end
__bpf_kfunc bool bpf_dynptr_is_null(const struct bpf_dynptr *p)
{
- struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+ const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
return !ptr->data;
}
__bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p)
{
- struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+ const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
if (!ptr->data)
return false;
@@ -3110,7 +3209,7 @@ __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p)
__bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p)
{
- struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+ const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
if (!ptr->data)
return -EINVAL;
@@ -3122,7 +3221,7 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p,
struct bpf_dynptr *clone__uninit)
{
struct bpf_dynptr_kern *clone = (struct bpf_dynptr_kern *)clone__uninit;
- struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+ const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
if (!ptr->data) {
bpf_dynptr_set_null(clone);
@@ -3145,11 +3244,11 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p,
* Copies data from source dynptr to destination dynptr.
* Returns 0 on success; negative error, otherwise.
*/
-__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off,
- struct bpf_dynptr *src_ptr, u64 src_off, u64 size)
+__bpf_kfunc int bpf_dynptr_copy(const struct bpf_dynptr *dst_ptr, u64 dst_off,
+ const struct bpf_dynptr *src_ptr, u64 src_off, u64 size)
{
- struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr;
- struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr;
+ const struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr;
+ const struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr;
void *src_slice, *dst_slice;
char buf[256];
u64 off;
@@ -3200,9 +3299,9 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off,
* at @offset with the constant byte @val.
* Returns 0 on success; negative error, otherwise.
*/
-__bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u64 offset, u64 size, u8 val)
+__bpf_kfunc int bpf_dynptr_memset(const struct bpf_dynptr *p, u64 offset, u64 size, u8 val)
{
- struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+ const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
u64 chunk_sz, write_off;
char buf[256];
void* slice;
@@ -3301,7 +3400,7 @@ __bpf_kfunc void bpf_throw(u64 cookie)
* which skips compiler generated instrumentation to do the same.
*/
kasan_unpoison_task_stack_below((void *)(long)ctx.sp);
- ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0);
+ ctx.aux->bpf_exception_cb(cookie, ctx.sp + ctx.aux->stack_arg_sp_adjust, ctx.bp, 0, 0);
WARN(1, "A call to BPF exception callback should never return\n");
}
@@ -4214,13 +4313,13 @@ __bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
*
* Return: 0 on success, a negative value on error.
*/
-__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
- struct bpf_dynptr *sig_p,
+__bpf_kfunc int bpf_verify_pkcs7_signature(const struct bpf_dynptr *data_p,
+ const struct bpf_dynptr *sig_p,
struct bpf_key *trusted_keyring)
{
#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
- struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p;
- struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p;
+ const struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p;
+ const struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p;
const void *data, *sig;
u32 data_len, sig_len;
int ret;
@@ -4241,8 +4340,13 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
data_len = __bpf_dynptr_size(data_ptr);
data = __bpf_dynptr_data(data_ptr, data_len);
+ if (!data)
+ return -EINVAL;
+
sig_len = __bpf_dynptr_size(sig_ptr);
sig = __bpf_dynptr_data(sig_ptr, sig_len);
+ if (!sig)
+ return -EINVAL;
return verify_pkcs7_signature(data, data_len, sig, sig_len,
trusted_keyring->key,
@@ -4713,10 +4817,15 @@ BTF_ID_FLAGS(func, bpf_list_push_front, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_list_push_front_impl)
BTF_ID_FLAGS(func, bpf_list_push_back, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_list_push_back_impl)
+BTF_ID_FLAGS(func, bpf_list_add, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_del, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_is_first)
+BTF_ID_FLAGS(func, bpf_list_is_last)
+BTF_ID_FLAGS(func, bpf_list_empty)
BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL)
@@ -4857,7 +4966,7 @@ BTF_ID_FLAGS(func, bpf_stream_print_stack, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_dynptr_from_file)
-BTF_ID_FLAGS(func, bpf_dynptr_file_discard)
+BTF_ID_FLAGS(func, bpf_dynptr_file_discard, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_timer_cancel_async)
BTF_KFUNCS_END(common_btf_ids)
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 25c06a011825..7837968c0842 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -21,6 +21,9 @@
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/kstrtox.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+
#include "preload/bpf_preload.h"
enum bpf_type {
@@ -30,6 +33,23 @@ enum bpf_type {
BPF_TYPE_LINK,
};
+struct bpf_fs_inode {
+ struct list_head xattrs;
+ struct simple_xattr_limits xlimits;
+ struct inode vfs_inode;
+};
+
+static inline struct bpf_fs_inode *BPF_FS_I(struct inode *inode)
+{
+ return container_of(inode, struct bpf_fs_inode, vfs_inode);
+}
+
+static struct kmem_cache *bpf_fs_inode_cachep __ro_after_init;
+
+static int bpf_fs_initxattrs(struct inode *inode,
+ const struct xattr *xattr_array, void *fs_info);
+static ssize_t bpf_fs_listxattr(struct dentry *dentry, char *buf, size_t size);
+
static void *bpf_any_get(void *raw, enum bpf_type type)
{
switch (type) {
@@ -94,10 +114,17 @@ static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type)
}
static const struct inode_operations bpf_dir_iops;
+static const struct inode_operations bpf_symlink_iops;
-static const struct inode_operations bpf_prog_iops = { };
-static const struct inode_operations bpf_map_iops = { };
-static const struct inode_operations bpf_link_iops = { };
+static const struct inode_operations bpf_prog_iops = {
+ .listxattr = bpf_fs_listxattr,
+};
+static const struct inode_operations bpf_map_iops = {
+ .listxattr = bpf_fs_listxattr,
+};
+static const struct inode_operations bpf_link_iops = {
+ .listxattr = bpf_fs_listxattr,
+};
struct inode *bpf_get_inode(struct super_block *sb,
const struct inode *dir,
@@ -153,11 +180,19 @@ static struct dentry *bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
struct inode *inode;
+ int ret;
inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR);
if (IS_ERR(inode))
return ERR_CAST(inode);
+ ret = security_inode_init_security(inode, dir, &dentry->d_name,
+ bpf_fs_initxattrs, NULL);
+ if (ret && ret != -EOPNOTSUPP) {
+ iput(inode);
+ return ERR_PTR(ret);
+ }
+
inode->i_op = &bpf_dir_iops;
inode->i_fop = &simple_dir_operations;
@@ -330,10 +365,20 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
const struct file_operations *fops)
{
struct inode *dir = dentry->d_parent->d_inode;
- struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode);
+ struct inode *inode;
+ int ret;
+
+ inode = bpf_get_inode(dir->i_sb, dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
+ ret = security_inode_init_security(inode, dir, &dentry->d_name,
+ bpf_fs_initxattrs, NULL);
+ if (ret && ret != -EOPNOTSUPP) {
+ iput(inode);
+ return ret;
+ }
+
inode->i_op = iops;
inode->i_fop = fops;
inode->i_private = raw;
@@ -382,9 +427,11 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *target)
{
- char *link = kstrdup(target, GFP_USER | __GFP_NOWARN);
struct inode *inode;
+ char *link;
+ int ret;
+ link = kstrdup(target, GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (!link)
return -ENOMEM;
@@ -394,13 +441,25 @@ static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir,
return PTR_ERR(inode);
}
- inode->i_op = &simple_symlink_inode_operations;
+ inode->i_op = &bpf_symlink_iops;
inode->i_link = link;
+ ret = security_inode_init_security(inode, dir, &dentry->d_name,
+ bpf_fs_initxattrs, NULL);
+ if (ret && ret != -EOPNOTSUPP) {
+ iput(inode);
+ return ret;
+ }
+
bpf_dentry_finalize(dentry, inode, dir);
return 0;
}
+static const struct inode_operations bpf_symlink_iops = {
+ .get_link = simple_get_link,
+ .listxattr = bpf_fs_listxattr,
+};
+
static const struct inode_operations bpf_dir_iops = {
.lookup = bpf_lookup,
.mkdir = bpf_mkdir,
@@ -409,6 +468,7 @@ static const struct inode_operations bpf_dir_iops = {
.rename = simple_rename,
.link = simple_link,
.unlink = simple_unlink,
+ .listxattr = bpf_fs_listxattr,
};
/* pin iterator link into bpffs */
@@ -762,22 +822,151 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
return 0;
}
+static struct inode *bpf_fs_alloc_inode(struct super_block *sb)
+{
+ struct bpf_fs_inode *bi;
+
+ bi = alloc_inode_sb(sb, bpf_fs_inode_cachep, GFP_KERNEL);
+ if (!bi)
+ return NULL;
+ INIT_LIST_HEAD_RCU(&bi->xattrs);
+ simple_xattr_limits_init(&bi->xlimits);
+ return &bi->vfs_inode;
+}
+
static void bpf_destroy_inode(struct inode *inode)
{
+ struct bpf_mount_opts *opts = inode->i_sb->s_fs_info;
+ struct bpf_fs_inode *bi = BPF_FS_I(inode);
enum bpf_type type;
- if (S_ISLNK(inode->i_mode))
- kfree(inode->i_link);
if (!bpf_inode_type(inode, &type))
bpf_any_put(inode->i_private, type);
- free_inode_nonrcu(inode);
+ simple_xattrs_free(&opts->xa_cache, &bi->xattrs, NULL);
+}
+
+/*
+ * Called after RCU grace period - safe to free inode and anything
+ * that might be accessed by RCU pathwalk (inode fields, i_link).
+ */
+static void bpf_free_inode(struct inode *inode)
+{
+ if (S_ISLNK(inode->i_mode))
+ kfree(inode->i_link);
+ kmem_cache_free(bpf_fs_inode_cachep, BPF_FS_I(inode));
+}
+
+static int bpf_fs_xattr_get(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *value, size_t size)
+{
+ struct bpf_mount_opts *opts = inode->i_sb->s_fs_info;
+ struct bpf_fs_inode *bi = BPF_FS_I(inode);
+
+ name = xattr_full_name(handler, name);
+ return simple_xattr_get(&opts->xa_cache, &bi->xattrs, name, value, size);
+}
+
+enum {
+ BPF_FS_XATTR_UNSPEC,
+ BPF_FS_XATTR_SECURITY,
+ BPF_FS_XATTR_TRUSTED,
+};
+
+static int bpf_fs_xattr_set(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap, struct dentry *unused,
+ struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct bpf_mount_opts *opts = inode->i_sb->s_fs_info;
+ struct bpf_fs_inode *bi = BPF_FS_I(inode);
+ struct simple_xattr *old;
+ int err = -EINVAL;
+
+ name = xattr_full_name(handler, name);
+ switch (handler->flags) {
+ case BPF_FS_XATTR_SECURITY:
+ err = simple_xattr_set_limited(&opts->xa_cache, &bi->xattrs,
+ &bi->xlimits, name, value, size,
+ flags);
+ break;
+ case BPF_FS_XATTR_TRUSTED:
+ old = simple_xattr_set(&opts->xa_cache, &bi->xattrs, name,
+ value, size, flags);
+ err = IS_ERR(old) ? PTR_ERR(old) : 0;
+ if (!err)
+ simple_xattr_free_rcu(old);
+ break;
+ }
+ if (err)
+ return err;
+ inode_set_ctime_current(inode);
+ return 0;
+}
+
+static const struct xattr_handler bpf_fs_trusted_xattr_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .flags = BPF_FS_XATTR_TRUSTED,
+ .get = bpf_fs_xattr_get,
+ .set = bpf_fs_xattr_set,
+};
+
+static const struct xattr_handler bpf_fs_security_xattr_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .flags = BPF_FS_XATTR_SECURITY,
+ .get = bpf_fs_xattr_get,
+ .set = bpf_fs_xattr_set,
+};
+
+static const struct xattr_handler * const bpf_fs_xattr_handlers[] = {
+ &bpf_fs_trusted_xattr_handler,
+ &bpf_fs_security_xattr_handler,
+ NULL,
+};
+
+static ssize_t bpf_fs_listxattr(struct dentry *dentry, char *buf, size_t size)
+{
+ struct inode *inode = d_inode(dentry);
+
+ return simple_xattr_list(inode, &BPF_FS_I(inode)->xattrs, buf, size);
+}
+
+static int bpf_fs_initxattrs(struct inode *inode,
+ const struct xattr *xattr_array, void *fs_info)
+{
+ struct bpf_mount_opts *opts = inode->i_sb->s_fs_info;
+ struct bpf_fs_inode *bi = BPF_FS_I(inode);
+ const struct xattr *xattr;
+ int err;
+
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ CLASS(simple_xattr, new_xattr)(xattr->value, xattr->value_len);
+ if (IS_ERR(new_xattr))
+ return PTR_ERR(new_xattr);
+
+ new_xattr->name = kasprintf(GFP_KERNEL_ACCOUNT,
+ XATTR_SECURITY_PREFIX "%s",
+ xattr->name);
+ if (!new_xattr->name)
+ return -ENOMEM;
+
+ err = simple_xattr_add_limited(&opts->xa_cache, &bi->xattrs,
+ &bi->xlimits, new_xattr);
+ if (err)
+ return err;
+
+ retain_and_null_ptr(new_xattr);
+ }
+ return 0;
}
const struct super_operations bpf_super_ops = {
.statfs = simple_statfs,
.drop_inode = inode_just_drop,
.show_options = bpf_show_options,
+ .alloc_inode = bpf_fs_alloc_inode,
.destroy_inode = bpf_destroy_inode,
+ .free_inode = bpf_free_inode,
};
enum {
@@ -996,25 +1185,38 @@ out:
static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
{
- static const struct tree_descr bpf_rfiles[] = { { "" } };
struct bpf_mount_opts *opts = sb->s_fs_info;
struct inode *inode;
- int ret;
/* Mounting an instance of BPF FS requires privileges */
if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN))
return -EPERM;
- ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
- if (ret)
- return ret;
-
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
+ sb->s_magic = BPF_FS_MAGIC;
sb->s_op = &bpf_super_ops;
+ sb->s_xattr = bpf_fs_xattr_handlers;
+ sb->s_iflags |= SB_I_NOEXEC;
+ sb->s_iflags |= SB_I_NODEV;
+ sb->s_time_gran = 1;
- inode = sb->s_root->d_inode;
+ inode = bpf_get_inode(sb, NULL, S_IFDIR | 0777);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ inode->i_ino = 1;
+ inode->i_op = &bpf_dir_iops;
+ inode->i_fop = &simple_dir_operations;
+ set_nlink(inode, 2);
+
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root)
+ return -ENOMEM;
+
+ inode = d_inode(sb->s_root);
inode->i_uid = opts->uid;
inode->i_gid = opts->gid;
- inode->i_op = &bpf_dir_iops;
inode->i_mode &= ~S_IALLUGO;
populate_bpffs(sb->s_root);
inode->i_mode |= S_ISVTX | opts->mode;
@@ -1068,6 +1270,7 @@ static void bpf_kill_super(struct super_block *sb)
struct bpf_mount_opts *opts = sb->s_fs_info;
kill_anon_super(sb);
+ simple_xattr_cache_cleanup(&opts->xa_cache);
kfree(opts);
}
@@ -1080,18 +1283,37 @@ static struct file_system_type bpf_fs_type = {
.fs_flags = FS_USERNS_MOUNT,
};
+static void bpf_fs_inode_init_once(void *foo)
+{
+ struct bpf_fs_inode *bi = foo;
+
+ inode_init_once(&bi->vfs_inode);
+}
+
static int __init bpf_init(void)
{
int ret;
+ bpf_fs_inode_cachep = kmem_cache_create("bpf_fs_inode_cache",
+ sizeof(struct bpf_fs_inode),
+ 0, SLAB_ACCOUNT,
+ bpf_fs_inode_init_once);
+ if (!bpf_fs_inode_cachep)
+ return -ENOMEM;
+
ret = sysfs_create_mount_point(fs_kobj, "bpf");
if (ret)
- return ret;
+ goto out_cache;
ret = register_filesystem(&bpf_fs_type);
- if (ret)
+ if (ret) {
sysfs_remove_mount_point(fs_kobj, "bpf");
+ goto out_cache;
+ }
+ return 0;
+out_cache:
+ kmem_cache_destroy(bpf_fs_inode_cachep);
return ret;
}
fs_initcall(bpf_init);
diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c
index 332e6e003f27..0aadfbae0acc 100644
--- a/kernel/bpf/liveness.c
+++ b/kernel/bpf/liveness.c
@@ -610,6 +610,21 @@ enum arg_track_state {
/* Track callee stack slots fp-8 through fp-512 (64 slots of 8 bytes each) */
#define MAX_ARG_SPILL_SLOTS 64
+/*
+ * Combined register + stack arg tracking: R0-R10 at indices 0-10,
+ * outgoing stack arg slots at indices MAX_BPF_REG..MAX_BPF_REG+6.
+ */
+#define MAX_AT_TRACK_REGS (MAX_BPF_REG + MAX_STACK_ARG_SLOTS)
+
+static int stack_arg_off_to_slot(s16 off)
+{
+ int aoff = off < 0 ? -off : off;
+
+ if (aoff / 8 > MAX_STACK_ARG_SLOTS)
+ return -1;
+ return aoff / 8 - 1;
+}
+
static bool arg_is_visited(const struct arg_track *at)
{
return at->frame != ARG_UNVISITED;
@@ -791,7 +806,9 @@ static bool arg_track_join(struct bpf_verifier_env *env, int idx, int target, in
return true;
verbose(env, "arg JOIN insn %d -> %d ", idx, target);
- if (r >= 0)
+ if (r >= MAX_BPF_REG)
+ verbose(env, "sa%d: ", r - MAX_BPF_REG);
+ else if (r >= 0)
verbose(env, "r%d: ", r);
else
verbose(env, "fp%+d: ", r * 8);
@@ -1032,6 +1049,21 @@ static void arg_track_log(struct bpf_verifier_env *env, struct bpf_insn *insn, i
verbose(env, "\tr%d: ", i); verbose_arg_track(env, &at_in[i]);
verbose(env, " -> "); verbose_arg_track(env, &at_out[i]);
}
+ /* Log outgoing stack arg slot transitions at indices MAX_BPF_REG..MAX_AT_TRACK_REGS-1 */
+ for (i = 0; i < MAX_STACK_ARG_SLOTS; i++) {
+ int ai = MAX_BPF_REG + i;
+
+ if (arg_track_eq(&at_out[ai], &at_in[ai]))
+ continue;
+ if (!printed) {
+ verbose(env, "%3d: ", idx);
+ bpf_verbose_insn(env, insn);
+ bpf_vlog_reset(&env->log, env->log.end_pos - 1);
+ printed = true;
+ }
+ verbose(env, "\tsa%d: ", i); verbose_arg_track(env, &at_in[ai]);
+ verbose(env, " -> "); verbose_arg_track(env, &at_out[ai]);
+ }
for (i = 0; i < MAX_ARG_SPILL_SLOTS; i++) {
if (arg_track_eq(&at_stack_out[i], &at_stack_in[i]))
continue;
@@ -1062,6 +1094,7 @@ static bool can_be_local_fp(int depth, int regno, struct arg_track *at)
static void arg_track_xfer(struct bpf_verifier_env *env, struct bpf_insn *insn,
int insn_idx,
struct arg_track *at_out, struct arg_track *at_stack_out,
+ const struct arg_track *at_stack_arg_entry,
struct func_instance *instance,
u32 *callsites)
{
@@ -1071,9 +1104,21 @@ static void arg_track_xfer(struct bpf_verifier_env *env, struct bpf_insn *insn,
struct arg_track *dst = &at_out[insn->dst_reg];
struct arg_track *src = &at_out[insn->src_reg];
struct arg_track none = { .frame = ARG_NONE };
- int r;
-
- if (class == BPF_ALU64 && BPF_SRC(insn->code) == BPF_K) {
+ int r, slot;
+
+ /* Handle stack arg stores and loads. */
+ if (is_stack_arg_st(insn) || is_stack_arg_stx(insn)) {
+ slot = stack_arg_off_to_slot(insn->off);
+ if (slot >= 0) {
+ if (is_stack_arg_stx(insn))
+ at_out[MAX_BPF_REG + slot] = at_out[insn->src_reg];
+ else
+ at_out[MAX_BPF_REG + slot] = none;
+ }
+ } else if (is_stack_arg_ldx(insn)) {
+ slot = stack_arg_off_to_slot(insn->off);
+ at_out[insn->dst_reg] = (slot >= 0) ? at_stack_arg_entry[slot] : none;
+ } else if (class == BPF_ALU64 && BPF_SRC(insn->code) == BPF_K) {
if (code == BPF_MOV) {
*dst = none;
} else if (dst->frame >= 0) {
@@ -1297,6 +1342,16 @@ static int record_load_store_access(struct bpf_verifier_env *env,
struct arg_track resolved, *ptr;
int oi;
+ /*
+ * Stack arg insns use dst_reg/src_reg=BPF_REG_PARAMS(11). Since at[]
+ * is extended to MAX_AT_TRACK_REGS, at[11] holds the arg_track for
+ * outgoing stack arg slot 0 — not the pointer used for the memory
+ * access. Skip so the slot's tracked value isn't confused with the
+ * base register that record_stack_access() expects.
+ */
+ if (is_stack_arg_stx(insn) || is_stack_arg_st(insn) || is_stack_arg_ldx(insn))
+ return 0;
+
switch (class) {
case BPF_LDX:
ptr = &at[insn->src_reg];
@@ -1343,6 +1398,42 @@ static int record_load_store_access(struct bpf_verifier_env *env,
return 0;
}
+static int record_arg_access(struct bpf_verifier_env *env,
+ struct func_instance *instance,
+ struct bpf_insn *insn,
+ struct arg_track *at, int arg_idx,
+ int insn_idx)
+{
+ int depth = instance->depth;
+ int frame = at->frame;
+ int err = 0;
+ s64 bytes;
+
+ if (!arg_is_fp(at))
+ return 0;
+
+ if (bpf_helper_call(insn)) {
+ bytes = bpf_helper_stack_access_bytes(env, insn, arg_idx, insn_idx);
+ } else if (bpf_pseudo_kfunc_call(insn)) {
+ bytes = bpf_kfunc_stack_access_bytes(env, insn, arg_idx, insn_idx);
+ } else {
+ for (int f = 0; f <= depth; f++) {
+ err = mark_stack_read(instance, f, insn_idx, SPIS_ALL);
+ if (err)
+ return err;
+ }
+ return 0;
+ }
+ if (bytes == 0)
+ return 0;
+
+ if (frame >= 0 && frame <= depth)
+ err = record_stack_access(instance, at, bytes, frame, insn_idx);
+ else if (frame == ARG_IMPRECISE)
+ err = record_imprecise(instance, at->mask, insn_idx);
+ return err;
+}
+
/* Record stack access for a given 'at' state of helper/kfunc 'insn' */
static int record_call_access(struct bpf_verifier_env *env,
struct func_instance *instance,
@@ -1350,9 +1441,8 @@ static int record_call_access(struct bpf_verifier_env *env,
int insn_idx)
{
struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
- int depth = instance->depth;
struct bpf_call_summary cs;
- int r, err = 0, num_params = 5;
+ int r, err, num_params = 5;
if (bpf_pseudo_call(insn))
return 0;
@@ -1360,32 +1450,15 @@ static int record_call_access(struct bpf_verifier_env *env,
if (bpf_get_call_summary(env, insn, &cs))
num_params = cs.num_params;
- for (r = BPF_REG_1; r < BPF_REG_1 + num_params; r++) {
- int frame = at[r].frame;
- s64 bytes;
-
- if (!arg_is_fp(&at[r]))
- continue;
-
- if (bpf_helper_call(insn)) {
- bytes = bpf_helper_stack_access_bytes(env, insn, r - 1, insn_idx);
- } else if (bpf_pseudo_kfunc_call(insn)) {
- bytes = bpf_kfunc_stack_access_bytes(env, insn, r - 1, insn_idx);
- } else {
- for (int f = 0; f <= depth; f++) {
- err = mark_stack_read(instance, f, insn_idx, SPIS_ALL);
- if (err)
- return err;
- }
- return 0;
- }
- if (bytes == 0)
- continue;
+ for (r = BPF_REG_1; r < BPF_REG_1 + min(num_params, MAX_BPF_FUNC_REG_ARGS); r++) {
+ err = record_arg_access(env, instance, insn, &at[r], r - 1, insn_idx);
+ if (err)
+ return err;
+ }
- if (frame >= 0 && frame <= depth)
- err = record_stack_access(instance, &at[r], bytes, frame, insn_idx);
- else if (frame == ARG_IMPRECISE)
- err = record_imprecise(instance, at[r].mask, insn_idx);
+ for (r = 0; r < MAX_STACK_ARG_SLOTS && r < num_params - MAX_BPF_FUNC_REG_ARGS; r++) {
+ err = record_arg_access(env, instance, insn, &at[MAX_BPF_REG + r],
+ r + MAX_BPF_FUNC_REG_ARGS, insn_idx);
if (err)
return err;
}
@@ -1445,7 +1518,7 @@ static int find_callback_subprog(struct bpf_verifier_env *env,
/* Per-subprog intermediate state kept alive across analysis phases */
struct subprog_at_info {
- struct arg_track (*at_in)[MAX_BPF_REG];
+ struct arg_track (*at_in)[MAX_AT_TRACK_REGS];
int len;
};
@@ -1479,6 +1552,9 @@ static void print_subprog_arg_access(struct bpf_verifier_env *env,
for (r = 0; r < MAX_BPF_REG - 1; r++)
if (arg_is_fp(&info->at_in[i][r]))
has_extra = true;
+ for (r = 0; r < MAX_STACK_ARG_SLOTS; r++)
+ if (arg_is_fp(&info->at_in[i][MAX_BPF_REG + r]))
+ has_extra = true;
}
if (is_ldx_stx_call) {
for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++)
@@ -1503,6 +1579,12 @@ static void print_subprog_arg_access(struct bpf_verifier_env *env,
verbose(env, " r%d=", r);
verbose_arg_track(env, &info->at_in[i][r]);
}
+ for (r = 0; r < MAX_STACK_ARG_SLOTS; r++) {
+ if (!arg_is_fp(&info->at_in[i][MAX_BPF_REG + r]))
+ continue;
+ verbose(env, " sa%d=", r);
+ verbose_arg_track(env, &info->at_in[i][MAX_BPF_REG + r]);
+ }
}
if (is_ldx_stx_call) {
@@ -1525,7 +1607,7 @@ static void print_subprog_arg_access(struct bpf_verifier_env *env,
* Runs forward fixed-point with arg_track_xfer(), then records
* memory accesses in a single linear pass over converged state.
*
- * @callee_entry: pre-populated entry state for R1-R5
+ * @callee_entry: pre-populated entry state for R1-R5 and stack args
* NULL for main (subprog 0).
* @info: stores at_in, len for debug printing.
*/
@@ -1543,10 +1625,11 @@ static int compute_subprog_args(struct bpf_verifier_env *env,
int end = env->subprog_info[subprog + 1].start;
int po_end = env->subprog_info[subprog + 1].postorder_start;
int len = end - start;
- struct arg_track (*at_in)[MAX_BPF_REG] = NULL;
- struct arg_track at_out[MAX_BPF_REG];
+ struct arg_track (*at_in)[MAX_AT_TRACK_REGS] = NULL;
+ struct arg_track at_out[MAX_AT_TRACK_REGS];
struct arg_track (*at_stack_in)[MAX_ARG_SPILL_SLOTS] = NULL;
struct arg_track *at_stack_out = NULL;
+ struct arg_track at_stack_arg_entry[MAX_STACK_ARG_SLOTS];
struct arg_track unvisited = { .frame = ARG_UNVISITED };
struct arg_track none = { .frame = ARG_NONE };
bool changed;
@@ -1565,13 +1648,13 @@ static int compute_subprog_args(struct bpf_verifier_env *env,
goto err_free;
for (i = 0; i < len; i++) {
- for (r = 0; r < MAX_BPF_REG; r++)
+ for (r = 0; r < MAX_AT_TRACK_REGS; r++)
at_in[i][r] = unvisited;
for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++)
at_stack_in[i][r] = unvisited;
}
- for (r = 0; r < MAX_BPF_REG; r++)
+ for (r = 0; r < MAX_AT_TRACK_REGS; r++)
at_in[0][r] = none;
/* Entry: R10 is always precisely the current frame's FP */
@@ -1587,6 +1670,10 @@ static int compute_subprog_args(struct bpf_verifier_env *env,
for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++)
at_stack_in[0][r] = none;
+ /* Entry: incoming stack args from caller, or ARG_NONE for main */
+ for (r = 0; r < MAX_STACK_ARG_SLOTS; r++)
+ at_stack_arg_entry[r] = callee_entry ? callee_entry[MAX_BPF_REG + r] : none;
+
if (env->log.level & BPF_LOG_LEVEL2)
verbose(env, "subprog#%d: analyzing (depth %d)...\n", subprog, depth);
@@ -1605,7 +1692,8 @@ redo:
memcpy(at_out, at_in[i], sizeof(at_out));
memcpy(at_stack_out, at_stack_in[i], MAX_ARG_SPILL_SLOTS * sizeof(*at_stack_out));
- arg_track_xfer(env, insn, idx, at_out, at_stack_out, instance, callsites);
+ arg_track_xfer(env, insn, idx, at_out, at_stack_out,
+ at_stack_arg_entry, instance, callsites);
arg_track_log(env, insn, idx, at_in[i], at_stack_in[i], at_out, at_stack_out);
/* Propagate to successors within this subprogram */
@@ -1619,7 +1707,7 @@ redo:
continue;
ti = target - start;
- for (r = 0; r < MAX_BPF_REG; r++)
+ for (r = 0; r < MAX_AT_TRACK_REGS; r++)
changed |= arg_track_join(env, idx, target, r,
&at_in[ti][r], at_out[r]);
@@ -1674,11 +1762,14 @@ err_free:
return err;
}
-/* Return true if any of R1-R5 is derived from a frame pointer. */
+/* Return true if any of R1-R5 or stack args is derived from a frame pointer. */
static bool has_fp_args(struct arg_track *args)
{
for (int r = BPF_REG_1; r <= BPF_REG_5; r++)
- if (args[r].frame != ARG_NONE)
+ if (arg_is_fp(&args[r]))
+ return true;
+ for (int r = 0; r < MAX_STACK_ARG_SLOTS; r++)
+ if (arg_is_fp(&args[MAX_BPF_REG + r]))
return true;
return false;
}
@@ -1803,7 +1894,7 @@ static int analyze_subprog(struct bpf_verifier_env *env,
/* For each reachable call site in the subprog, recurse into callees */
for (int p = po_start; p < po_end; p++) {
int idx = env->cfg.insn_postorder[p];
- struct arg_track callee_args[BPF_REG_5 + 1];
+ struct arg_track callee_args[MAX_AT_TRACK_REGS] = {};
struct arg_track none = { .frame = ARG_NONE };
struct bpf_insn *insn = &insns[idx];
struct func_instance *callee_instance;
@@ -1818,9 +1909,11 @@ static int analyze_subprog(struct bpf_verifier_env *env,
if (callee < 0)
continue;
- /* Build entry args: R1-R5 from at_in at call site */
+ /* Build entry args: R1-R5 and stack args from at_in at call site */
for (int r = BPF_REG_1; r <= BPF_REG_5; r++)
callee_args[r] = info[subprog].at_in[j][r];
+ for (int r = 0; r < MAX_STACK_ARG_SLOTS; r++)
+ callee_args[MAX_BPF_REG + r] = info[subprog].at_in[j][MAX_BPF_REG + r];
} else if (bpf_calls_callback(env, idx)) {
callee = find_callback_subprog(env, insn, idx, &caller_reg, &cb_callee_reg);
if (callee == -2) {
@@ -1842,6 +1935,8 @@ static int analyze_subprog(struct bpf_verifier_env *env,
for (int r = BPF_REG_1; r <= BPF_REG_5; r++)
callee_args[r] = none;
+ for (int r = 0; r < MAX_STACK_ARG_SLOTS; r++)
+ callee_args[MAX_BPF_REG + r] = none;
callee_args[cb_callee_reg] = info[subprog].at_in[j][caller_reg];
} else {
continue;
@@ -1914,26 +2009,15 @@ int bpf_compute_subprog_arg_access(struct bpf_verifier_env *env)
return -ENOMEM;
}
- instance = call_instance(env, NULL, 0, 0);
- if (IS_ERR(instance)) {
- err = PTR_ERR(instance);
- goto out;
- }
- err = analyze_subprog(env, NULL, info, instance, callsites);
- if (err)
- goto out;
-
/*
- * Subprogs and callbacks that don't receive FP-derived arguments
- * cannot access ancestor stack frames, so they were skipped during
- * the recursive walk above. Async callbacks (timer, workqueue) are
- * also not reachable from the main program's call graph. Analyze
- * all unvisited subprogs as independent roots at depth 0.
+ * Analyze every subprog in reverse topological order (callers
+ * before callees) so that each subprog is analyzed before its
+ * callees, allowing the recursive walk inside analyze_subprog()
+ * to naturally reach callees that receive FP-derived args.
*
- * Use reverse topological order (callers before callees) so that
- * each subprog is analyzed before its callees, allowing the
- * recursive walk inside analyze_subprog() to naturally
- * reach nested callees that also lack FP-derived args.
+ * Subprogs and callbacks that don't receive FP-derived arguments
+ * cannot access ancestor stack frames and are analyzed independently.
+ * Async callbacks (timer, workqueue) are handled the same way.
*/
for (k = env->subprog_cnt - 1; k >= 0; k--) {
int sub = env->subprog_topo_order[k];
@@ -2096,7 +2180,7 @@ static void compute_insn_live_regs(struct bpf_verifier_env *env,
def = ALL_CALLER_SAVED_REGS;
use = def & ~BIT(BPF_REG_0);
if (bpf_get_call_summary(env, insn, &cs))
- use = GENMASK(cs.num_params, 1);
+ use = GENMASK(min_t(u8, cs.num_params, MAX_BPF_FUNC_REG_ARGS), 1);
break;
default:
def = 0;
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 011e4ec25acd..b740fa73ee26 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -13,17 +13,17 @@
#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
-static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
+static bool bpf_verifier_log_attr_valid(u32 log_level, char __user *log_buf, u32 log_size)
{
/* ubuf and len_total should both be specified (or not) together */
- if (!!log->ubuf != !!log->len_total)
+ if (!!log_buf != !!log_size)
return false;
/* log buf without log_level is meaningless */
- if (log->ubuf && log->level == 0)
+ if (log_buf && log_level == 0)
return false;
- if (log->level & ~BPF_LOG_MASK)
+ if (log_level & ~BPF_LOG_MASK)
return false;
- if (log->len_total > UINT_MAX >> 2)
+ if (log_size > UINT_MAX >> 2)
return false;
return true;
}
@@ -36,7 +36,7 @@ int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
log->len_total = log_size;
/* log attributes have to be sane */
- if (!bpf_verifier_log_attr_valid(log))
+ if (!bpf_verifier_log_attr_valid(log_level, log_buf, log_size))
return -EINVAL;
return 0;
@@ -571,20 +571,20 @@ static void print_scalar_ranges(struct bpf_verifier_env *env,
u64 val;
bool omit;
} minmaxs[] = {
- {"smin", reg->smin_value, reg->smin_value == S64_MIN},
- {"smax", reg->smax_value, reg->smax_value == S64_MAX},
- {"umin", reg->umin_value, reg->umin_value == 0},
- {"umax", reg->umax_value, reg->umax_value == U64_MAX},
+ {"smin", reg_smin(reg), reg_smin(reg) == S64_MIN},
+ {"smax", reg_smax(reg), reg_smax(reg) == S64_MAX},
+ {"umin", reg_umin(reg), reg_umin(reg) == 0},
+ {"umax", reg_umax(reg), reg_umax(reg) == U64_MAX},
{"smin32",
- is_snum_decimal((s64)reg->s32_min_value)
- ? (s64)reg->s32_min_value
- : (u32)reg->s32_min_value, reg->s32_min_value == S32_MIN},
+ is_snum_decimal((s64)reg_s32_min(reg))
+ ? (s64)reg_s32_min(reg)
+ : (u32)reg_s32_min(reg), reg_s32_min(reg) == S32_MIN},
{"smax32",
- is_snum_decimal((s64)reg->s32_max_value)
- ? (s64)reg->s32_max_value
- : (u32)reg->s32_max_value, reg->s32_max_value == S32_MAX},
- {"umin32", reg->u32_min_value, reg->u32_min_value == 0},
- {"umax32", reg->u32_max_value, reg->u32_max_value == U32_MAX},
+ is_snum_decimal((s64)reg_s32_max(reg))
+ ? (s64)reg_s32_max(reg)
+ : (u32)reg_s32_max(reg), reg_s32_max(reg) == S32_MAX},
+ {"umin32", reg_u32_min(reg), reg_u32_min(reg) == 0},
+ {"umax32", reg_u32_max(reg), reg_u32_max(reg) == U32_MAX},
}, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)];
bool neg1, neg2;
@@ -665,8 +665,8 @@ static void print_reg_state(struct bpf_verifier_env *env,
verbose_a("id=%d", reg->id & ~BPF_ADD_CONST);
if (reg->id & BPF_ADD_CONST)
verbose(env, "%+d", reg->delta);
- if (reg->ref_obj_id)
- verbose_a("ref_obj_id=%d", reg->ref_obj_id);
+ if (reg->parent_id)
+ verbose_a("parent_id=%d", reg->parent_id);
if (type_is_non_owning_ref(reg->type))
verbose_a("%s", "non_own_ref");
if (type_is_map_ptr(t)) {
@@ -768,21 +768,19 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type));
if (reg->id)
verbose_a("id=%d", reg->id);
- if (reg->ref_obj_id)
- verbose_a("ref_id=%d", reg->ref_obj_id);
- if (reg->dynptr_id)
- verbose_a("dynptr_id=%d", reg->dynptr_id);
+ if (reg->parent_id)
+ verbose_a("parent_id=%d", reg->parent_id);
verbose(env, ")");
break;
case STACK_ITER:
- /* only main slot has ref_obj_id set; skip others */
- if (!reg->ref_obj_id)
+ /* only main slot has id set; skip others */
+ if (!reg->id)
continue;
- verbose(env, " fp%d=iter_%s(ref_id=%d,state=%s,depth=%u)",
+ verbose(env, " fp%d=iter_%s(id=%d,state=%s,depth=%u)",
(-i - 1) * BPF_REG_SIZE,
iter_type_str(reg->iter.btf, reg->iter.btf_id),
- reg->ref_obj_id, iter_state_str(reg->iter.state),
+ reg->id, iter_state_str(reg->iter.state),
reg->iter.depth);
break;
case STACK_MISC:
@@ -825,3 +823,81 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st
}
print_verifier_state(env, vstate, frameno, false);
}
+
+int bpf_log_attr_init(struct bpf_log_attr *log, u64 log_buf, u32 log_size, u32 log_level,
+ u32 offsetof_log_true_size, bpfptr_t uattr, struct bpf_common_attr *common,
+ bpfptr_t uattr_common, u32 size_common)
+{
+ char __user *ubuf_common = u64_to_user_ptr(common->log_buf);
+ char __user *ubuf = u64_to_user_ptr(log_buf);
+
+ if (!bpf_verifier_log_attr_valid(common->log_level, ubuf_common, common->log_size) ||
+ !bpf_verifier_log_attr_valid(log_level, ubuf, log_size))
+ return -EINVAL;
+
+ if (ubuf && ubuf_common && (ubuf != ubuf_common || log_size != common->log_size ||
+ log_level != common->log_level))
+ return -EINVAL;
+
+ memset(log, 0, sizeof(*log));
+ log->ubuf = ubuf;
+ log->size = log_size;
+ log->level = log_level;
+ log->offsetof_true_size = offsetof_log_true_size;
+ log->uattr = uattr;
+
+ if (!ubuf && ubuf_common) {
+ log->ubuf = ubuf_common;
+ log->size = common->log_size;
+ log->level = common->log_level;
+ log->uattr = uattr_common;
+ log->offsetof_true_size = 0;
+ if (size_common >= offsetofend(struct bpf_common_attr, log_true_size))
+ log->offsetof_true_size = offsetof(struct bpf_common_attr, log_true_size);
+ }
+ return 0;
+}
+
+struct bpf_verifier_log *bpf_log_attr_create_vlog(struct bpf_log_attr *attr_log,
+ struct bpf_common_attr *common, bpfptr_t uattr,
+ u32 size)
+{
+ struct bpf_verifier_log *log;
+ int err;
+
+ memset(attr_log, 0, sizeof(*attr_log));
+ attr_log->uattr = uattr;
+ if (size >= offsetofend(struct bpf_common_attr, log_true_size))
+ attr_log->offsetof_true_size = offsetof(struct bpf_common_attr, log_true_size);
+
+ if (!size)
+ return NULL;
+
+ log = kzalloc_obj(*log, GFP_KERNEL);
+ if (!log)
+ return ERR_PTR(-ENOMEM);
+
+ err = bpf_vlog_init(log, common->log_level, u64_to_user_ptr(common->log_buf),
+ common->log_size);
+ if (err) {
+ kfree(log);
+ return ERR_PTR(err);
+ }
+
+ return log;
+}
+
+int bpf_log_attr_finalize(struct bpf_log_attr *attr, struct bpf_verifier_log *log)
+{
+ u32 log_true_size;
+ int err;
+
+ err = bpf_vlog_finalize(log, &log_true_size);
+
+ if (attr->offsetof_true_size &&
+ copy_to_bpfptr_offset(attr->uattr, attr->offsetof_true_size, &log_true_size,
+ sizeof(log_true_size)))
+ return -EFAULT;
+
+ return err;
+}
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 0f57608b385d..4d6f25db9ba1 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -246,7 +246,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
/* Start walking the trie from the root node ... */
- for (node = rcu_dereference_check(trie->root, rcu_read_lock_bh_held());
+ for (node = rcu_dereference_check(trie->root, bpf_rcu_lock_held());
node;) {
unsigned int next_bit;
size_t matchlen;
@@ -280,7 +280,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
*/
next_bit = extract_bit(key->data, node->prefixlen);
node = rcu_dereference_check(node->child[next_bit],
- rcu_read_lock_bh_held());
+ bpf_rcu_lock_held());
}
if (!found)
@@ -359,7 +359,7 @@ static long trie_update_elem(struct bpf_map *map,
*/
slot = &trie->root;
- while ((node = rcu_dereference(*slot))) {
+ while ((node = rcu_dereference_protected(*slot, 1))) {
matchlen = longest_prefix_match(trie, node, key);
if (node->prefixlen != matchlen ||
@@ -482,7 +482,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
trim = &trie->root;
trim2 = trim;
parent = NULL;
- while ((node = rcu_dereference(*trim))) {
+ while ((node = rcu_dereference_protected(*trim, 1))) {
matchlen = longest_prefix_match(trie, node, key);
if (node->prefixlen != matchlen ||
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 645bd30bc9a9..d2cbab4bdf64 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -20,7 +20,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
/* Does not support >1 level map-in-map */
if (inner_map->inner_map_meta)
return ERR_PTR(-EINVAL);
-
+ if (inner_map->excl_prog_sha)
+ return ERR_PTR(-ENOTSUPP);
if (!inner_map->ops->map_meta_equal)
return ERR_PTR(-ENOTSUPP);
@@ -101,6 +102,8 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
inner_map = __bpf_map_get(f);
if (IS_ERR(inner_map))
return inner_map;
+ if (inner_map->excl_prog_sha)
+ return ERR_PTR(-ENOTSUPP);
inner_map_meta = map->inner_map_meta;
if (inner_map_meta->ops->map_meta_equal(inner_map_meta, inner_map))
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index 261a03ea73d3..c19b360bad9e 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -112,6 +112,10 @@ static int bpf_iter_attach_map(struct bpf_prog *prog,
map = bpf_map_get_with_uref(linfo->map.map_fd);
if (IS_ERR(map))
return PTR_ERR(map);
+ if (map->excl_prog_sha) {
+ err = -EPERM;
+ goto put_map;
+ }
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
@@ -119,7 +123,8 @@ static int bpf_iter_attach_map(struct bpf_prog *prog,
is_percpu = true;
else if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
- map->map_type != BPF_MAP_TYPE_ARRAY)
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_RHASH)
goto put_map;
key_acc_size = prog->aux->max_rdonly_access;
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index da3d328f5c15..77ba03216c09 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -9,6 +9,7 @@
#include <linux/perf_event.h>
#include <linux/btf_ids.h>
#include <linux/buildid.h>
+#include <linux/mmap_lock.h>
#include "percpu_freelist.h"
#include "mmap_unlock_work.h"
@@ -152,6 +153,180 @@ static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, b
: build_id_parse_nofault(vma, build_id, NULL);
}
+static inline void stack_map_build_id_set_ip(struct bpf_stack_build_id *id)
+{
+ id->status = BPF_STACK_BUILD_ID_IP;
+ memset(id->build_id, 0, BUILD_ID_SIZE_MAX);
+}
+
+static inline u64 stack_map_build_id_offset(unsigned long vm_pgoff,
+ unsigned long vm_start, u64 ip)
+{
+ return (vm_pgoff << PAGE_SHIFT) + ip - vm_start;
+}
+
+static inline void stack_map_build_id_set_valid(struct bpf_stack_build_id *id,
+ u64 offset,
+ const unsigned char *build_id)
+{
+ id->status = BPF_STACK_BUILD_ID_VALID;
+ id->offset = offset;
+ if (id->build_id != build_id)
+ memcpy(id->build_id, build_id, BUILD_ID_SIZE_MAX);
+}
+
+struct stack_map_vma_lock {
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+};
+
+/*
+ * Acquire a stable read-side reference on the VMA covering @ip.
+ *
+ * With CONFIG_PER_VMA_LOCK=y this returns a VMA with its per-VMA read
+ * lock held and mmap_lock dropped, so the caller may sleep.
+ *
+ * With CONFIG_PER_VMA_LOCK=n it returns a VMA with mmap_lock still
+ * held; the caller must snapshot any fields it needs and pin vm_file
+ * with get_file() before stack_map_unlock_vma() drops mmap_lock, as
+ * the VMA may be split, merged, or freed after that.
+ *
+ * Returns NULL on failure, in which case no lock is held.
+ */
+static struct vm_area_struct *
+stack_map_lock_vma(struct stack_map_vma_lock *lock, unsigned long ip)
+{
+ struct mm_struct *mm = lock->mm;
+ struct vm_area_struct *vma;
+
+ /* noop under !CONFIG_PER_VMA_LOCK */
+ vma = lock_vma_under_rcu(mm, ip);
+ if (vma) {
+ lock->vma = vma;
+ return vma;
+ }
+
+ /*
+ * Taking mmap_read_lock() is unsafe here, because the caller BPF
+ * program might already hold it, causing a deadlock.
+ */
+ if (!mmap_read_trylock(mm))
+ return NULL;
+
+ vma = vma_lookup(mm, ip);
+ if (!vma) {
+ mmap_read_unlock(mm);
+ return NULL;
+ }
+
+#ifdef CONFIG_PER_VMA_LOCK
+ if (!vma_start_read_locked(vma)) {
+ mmap_read_unlock(mm);
+ return NULL;
+ }
+ mmap_read_unlock(mm);
+#endif
+
+ lock->vma = vma;
+ return vma;
+}
+
+static void stack_map_unlock_vma(struct stack_map_vma_lock *lock)
+{
+#ifdef CONFIG_PER_VMA_LOCK
+ vma_end_read(lock->vma);
+#else
+ mmap_read_unlock(lock->mm);
+#endif
+ lock->vma = NULL;
+}
+
+static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *id_offs,
+ u32 trace_nr)
+{
+ struct mm_struct *mm = current->mm;
+ struct stack_map_vma_lock lock = { .mm = mm };
+ struct {
+ struct file *file;
+ const unsigned char *build_id;
+ unsigned long vm_start;
+ unsigned long vm_end;
+ unsigned long vm_pgoff;
+ } cache = {};
+ unsigned long vm_pgoff, vm_start, vm_end;
+ struct vm_area_struct *vma;
+ struct file *file;
+ u64 offset;
+ u64 ip;
+
+ for (u32 i = 0; i < trace_nr; i++) {
+ ip = READ_ONCE(id_offs[i].ip);
+
+ /*
+ * Range cache fast path: if ip falls within the previously
+ * resolved VMA range, reuse the cached build_id without
+ * re-acquiring the VMA lock.
+ */
+ if (cache.build_id && ip >= cache.vm_start && ip < cache.vm_end) {
+ offset = stack_map_build_id_offset(cache.vm_pgoff, cache.vm_start, ip);
+ stack_map_build_id_set_valid(&id_offs[i], offset, cache.build_id);
+ continue;
+ }
+
+ vma = stack_map_lock_vma(&lock, ip);
+ if (!vma) {
+ stack_map_build_id_set_ip(&id_offs[i]);
+ continue;
+ }
+ if (vma_is_anonymous(vma) || !vma->vm_file) {
+ stack_map_build_id_set_ip(&id_offs[i]);
+ stack_map_unlock_vma(&lock);
+ continue;
+ }
+
+ file = vma->vm_file;
+ vm_pgoff = vma->vm_pgoff;
+ vm_start = vma->vm_start;
+ vm_end = vma->vm_end;
+ offset = stack_map_build_id_offset(vm_pgoff, vm_start, ip);
+
+ /*
+ * Same backing file as previous (e.g. different VMAs
+ * of the same ELF binary). Reuse the cached build_id.
+ */
+ if (file == cache.file) {
+ stack_map_unlock_vma(&lock);
+ stack_map_build_id_set_valid(&id_offs[i], offset, cache.build_id);
+ cache.vm_start = vm_start;
+ cache.vm_end = vm_end;
+ cache.vm_pgoff = vm_pgoff;
+ continue;
+ }
+
+ file = get_file(file);
+ stack_map_unlock_vma(&lock);
+
+ /* build_id_parse_file() may block on filesystem reads */
+ if (build_id_parse_file(file, id_offs[i].build_id, NULL)) {
+ stack_map_build_id_set_ip(&id_offs[i]);
+ fput(file);
+ continue;
+ }
+
+ stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id);
+ if (cache.file)
+ fput(cache.file);
+ cache.file = file;
+ cache.build_id = id_offs[i].build_id;
+ cache.vm_start = vm_start;
+ cache.vm_end = vm_end;
+ cache.vm_pgoff = vm_pgoff;
+ }
+
+ if (cache.file)
+ fput(cache.file);
+}
+
/*
* Expects all id_offs[i].ip values to be set to correct initial IPs.
* They will be subsequently:
@@ -165,44 +340,50 @@ static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, b
static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
u32 trace_nr, bool user, bool may_fault)
{
- int i;
struct mmap_unlock_irq_work *work = NULL;
bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
+ bool has_user_ctx = user && current && current->mm;
struct vm_area_struct *vma, *prev_vma = NULL;
- const char *prev_build_id;
+ const unsigned char *prev_build_id = NULL;
+ int i;
+
+ if (may_fault && has_user_ctx) {
+ stack_map_get_build_id_offset_sleepable(id_offs, trace_nr);
+ return;
+ }
/* If the irq_work is in use, fall back to report ips. Same
* fallback is used for kernel stack (!user) on a stackmap with
* build_id.
*/
- if (!user || !current || !current->mm || irq_work_busy ||
- !mmap_read_trylock(current->mm)) {
+ if (!has_user_ctx || irq_work_busy || !mmap_read_trylock(current->mm)) {
/* cannot access current->mm, fall back to ips */
- for (i = 0; i < trace_nr; i++) {
- id_offs[i].status = BPF_STACK_BUILD_ID_IP;
- memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
- }
+ for (i = 0; i < trace_nr; i++)
+ stack_map_build_id_set_ip(&id_offs[i]);
return;
}
for (i = 0; i < trace_nr; i++) {
u64 ip = READ_ONCE(id_offs[i].ip);
+ u64 offset;
- if (range_in_vma(prev_vma, ip, ip)) {
+ if (prev_build_id && range_in_vma(prev_vma, ip, ip)) {
vma = prev_vma;
- memcpy(id_offs[i].build_id, prev_build_id, BUILD_ID_SIZE_MAX);
- goto build_id_valid;
+ offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip);
+ stack_map_build_id_set_valid(&id_offs[i], offset, prev_build_id);
+ continue;
}
vma = find_vma(current->mm, ip);
- if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) {
+ if (!vma || vma_is_anonymous(vma) ||
+ fetch_build_id(vma, id_offs[i].build_id, may_fault)) {
/* per entry fall back to ips */
- id_offs[i].status = BPF_STACK_BUILD_ID_IP;
- memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
+ stack_map_build_id_set_ip(&id_offs[i]);
+ prev_vma = vma;
+ prev_build_id = NULL;
continue;
}
-build_id_valid:
- id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ip - vma->vm_start;
- id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
+ offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip);
+ stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id);
prev_vma = vma;
prev_build_id = id_offs[i].build_id;
}
diff --git a/kernel/bpf/states.c b/kernel/bpf/states.c
index 8478d2c6ed5b..32f346ce3ffc 100644
--- a/kernel/bpf/states.c
+++ b/kernel/bpf/states.c
@@ -2,6 +2,7 @@
/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
#include <linux/bpf.h>
#include <linux/bpf_verifier.h>
+#include <linux/cnum.h>
#include <linux/filter.h>
#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
@@ -301,14 +302,8 @@ int bpf_update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_s
static bool range_within(const struct bpf_reg_state *old,
const struct bpf_reg_state *cur)
{
- return old->umin_value <= cur->umin_value &&
- old->umax_value >= cur->umax_value &&
- old->smin_value <= cur->smin_value &&
- old->smax_value >= cur->smax_value &&
- old->u32_min_value <= cur->u32_min_value &&
- old->u32_max_value >= cur->u32_max_value &&
- old->s32_min_value <= cur->s32_min_value &&
- old->s32_max_value >= cur->s32_max_value;
+ return cnum64_is_subset(old->r64, cur->r64) &&
+ cnum32_is_subset(old->r32, cur->r32);
}
/* If in the old state two registers had the same id, then they need to have
@@ -348,8 +343,12 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
return true;
}
- /* We ran out of idmap slots, which should be impossible */
- WARN_ON_ONCE(1);
+ /*
+ * idmap slots are bounded by the number of registers and stack slots.
+ * Referenced dynptrs, however, acquire intermediate references that do
+ * not live in either, so the map can be exhausted. Since this is unlikely,
+ * fail the verification by treating the states as not equivalent.
+ */
return false;
}
@@ -494,7 +493,7 @@ static bool regs_exact(const struct bpf_reg_state *rold,
{
return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
check_ids(rold->id, rcur->id, idmap) &&
- check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
+ check_ids(rold->parent_id, rcur->parent_id, idmap);
}
enum exact_level {
@@ -619,7 +618,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
range_within(rold, rcur) &&
tnum_in(rold->var_off, rcur->var_off) &&
check_ids(rold->id, rcur->id, idmap) &&
- check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
+ check_ids(rold->parent_id, rcur->parent_id, idmap);
case PTR_TO_PACKET_META:
case PTR_TO_PACKET:
/* We must have at least as much range as the old ptr
@@ -799,7 +798,8 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
cur_reg = &cur->stack[spi].spilled_ptr;
if (old_reg->dynptr.type != cur_reg->dynptr.type ||
old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot ||
- !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+ !check_ids(old_reg->id, cur_reg->id, idmap) ||
+ !check_ids(old_reg->parent_id, cur_reg->parent_id, idmap))
return false;
break;
case STACK_ITER:
@@ -815,13 +815,13 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
old_reg->iter.btf_id != cur_reg->iter.btf_id ||
old_reg->iter.state != cur_reg->iter.state ||
/* ignore {old_reg,cur_reg}->iter.depth, see above */
- !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+ !check_ids(old_reg->id, cur_reg->id, idmap))
return false;
break;
case STACK_IRQ_FLAG:
old_reg = &old->stack[spi].spilled_ptr;
cur_reg = &cur->stack[spi].spilled_ptr;
- if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) ||
+ if (!check_ids(old_reg->id, cur_reg->id, idmap) ||
old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class)
return false;
break;
@@ -838,6 +838,32 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
return true;
}
+/*
+ * Compare stack arg slots between old and current states.
+ * Outgoing stack args are path-local state and must agree for pruning.
+ */
+static bool stack_arg_safe(struct bpf_verifier_env *env, struct bpf_func_state *old,
+ struct bpf_func_state *cur, struct bpf_idmap *idmap,
+ enum exact_level exact)
+{
+ int i, nslots;
+
+ nslots = max(old->out_stack_arg_cnt, cur->out_stack_arg_cnt);
+ for (i = 0; i < nslots; i++) {
+ struct bpf_reg_state *old_arg, *cur_arg;
+ struct bpf_reg_state not_init = { .type = NOT_INIT };
+
+ old_arg = i < old->out_stack_arg_cnt ?
+ &old->stack_arg_regs[i] : &not_init;
+ cur_arg = i < cur->out_stack_arg_cnt ?
+ &cur->stack_arg_regs[i] : &not_init;
+ if (!regsafe(env, old_arg, cur_arg, idmap, exact))
+ return false;
+ }
+
+ return true;
+}
+
static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *cur,
struct bpf_idmap *idmap)
{
@@ -868,6 +894,9 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c
return false;
switch (old->refs[i].type) {
case REF_TYPE_PTR:
+ if (!check_ids(old->refs[i].parent_id, cur->refs[i].parent_id, idmap))
+ return false;
+ break;
case REF_TYPE_IRQ:
break;
case REF_TYPE_LOCK:
@@ -920,6 +949,9 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
if (old->callback_depth > cur->callback_depth)
return false;
+ if (!old->no_stack_arg_load && cur->no_stack_arg_load)
+ return false;
+
for (i = 0; i < MAX_BPF_REG; i++)
if (((1 << i) & live_regs) &&
!regsafe(env, &old->regs[i], &cur->regs[i],
@@ -929,6 +961,9 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
if (!stacksafe(env, old, cur, &env->idmap_scratch, exact))
return false;
+ if (!stack_arg_safe(env, old, cur, &env->idmap_scratch, exact))
+ return false;
+
return true;
}
@@ -1376,7 +1411,7 @@ hit:
*/
err = 0;
if (bpf_is_jmp_point(env, env->insn_idx))
- err = bpf_push_jmp_history(env, cur, 0, 0);
+ err = bpf_push_jmp_history(env, cur, 0, 0, 0, 0);
err = err ? : propagate_precision(env, &sl->state, cur, NULL);
if (err)
return err;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a3c0214ca934..b44106c8ea75 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -41,6 +41,7 @@
#include <linux/overflow.h>
#include <linux/cookie.h>
#include <linux/verification.h>
+#include <linux/btf_ids.h>
#include <net/netfilter/nf_bpf_link.h>
#include <net/netkit.h>
@@ -807,6 +808,11 @@ void bpf_obj_free_task_work(const struct btf_record *rec, void *obj)
bpf_task_work_cancel_and_free(obj + rec->task_work_off);
}
+void bpf_obj_cancel_fields(struct bpf_map *map, void *obj)
+{
+ bpf_map_free_internal_structs(map, obj);
+}
+
void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
{
const struct btf_field *fields;
@@ -1280,6 +1286,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
case BPF_SPIN_LOCK:
case BPF_RES_SPIN_LOCK:
if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_RHASH &&
map->map_type != BPF_MAP_TYPE_ARRAY &&
map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
@@ -1294,6 +1301,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
case BPF_WORKQUEUE:
case BPF_TASK_WORK:
if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_RHASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY) {
ret = -EOPNOTSUPP;
@@ -1305,6 +1313,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
case BPF_KPTR_PERCPU:
case BPF_REFCOUNT:
if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_RHASH &&
map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&
@@ -1359,7 +1368,8 @@ free_map_tab:
#define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size
/* called via syscall */
-static int map_create(union bpf_attr *attr, bpfptr_t uattr)
+static int map_create_alloc(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifier_log *log,
+ struct bpf_map **mapp, struct bpf_token **tokenp)
{
const struct bpf_map_ops *ops;
struct bpf_token *token = NULL;
@@ -1367,12 +1377,13 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
u32 map_type = attr->map_type;
struct bpf_map *map;
bool token_flag;
- int f_flags;
int err;
err = CHECK_ATTR(BPF_MAP_CREATE);
- if (err)
+ if (err) {
+ bpf_log(log, "Invalid attr.\n");
return -EINVAL;
+ }
/* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it
* to avoid per-map type checks tripping on unknown flag
@@ -1381,31 +1392,40 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
attr->map_flags &= ~BPF_F_TOKEN_FD;
if (attr->btf_vmlinux_value_type_id) {
- if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
- attr->btf_key_type_id || attr->btf_value_type_id)
+ if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS) {
+ bpf_log(log, "btf_vmlinux_value_type_id can only be used with struct_ops maps.\n");
return -EINVAL;
+ }
+ if (attr->btf_key_type_id || attr->btf_value_type_id) {
+ bpf_log(log, "btf_vmlinux_value_type_id is mutually exclusive with btf_key_type_id and btf_value_type_id.\n");
+ return -EINVAL;
+ }
} else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
+ bpf_log(log, "Invalid btf_value_type_id.\n");
return -EINVAL;
}
if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
attr->map_type != BPF_MAP_TYPE_ARENA &&
- attr->map_extra != 0)
+ attr->map_type != BPF_MAP_TYPE_RHASH &&
+ attr->map_extra != 0) {
+ bpf_log(log, "Invalid map_extra.\n");
return -EINVAL;
-
- f_flags = bpf_get_file_flag(attr->map_flags);
- if (f_flags < 0)
- return f_flags;
+ }
if (numa_node != NUMA_NO_NODE &&
((unsigned int)numa_node >= nr_node_ids ||
- !node_online(numa_node)))
+ !node_online(numa_node))) {
+ bpf_log(log, "Invalid numa_node.\n");
return -EINVAL;
+ }
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
map_type = attr->map_type;
- if (map_type >= ARRAY_SIZE(bpf_map_types))
+ if (map_type >= ARRAY_SIZE(bpf_map_types)) {
+ bpf_log(log, "Invalid map_type.\n");
return -EINVAL;
+ }
map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types));
ops = bpf_map_types[map_type];
if (!ops)
@@ -1423,8 +1443,10 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
if (token_flag) {
token = bpf_token_get_from_fd(attr->map_token_fd);
- if (IS_ERR(token))
+ if (IS_ERR(token)) {
+ bpf_log(log, "Invalid map_token_fd.\n");
return PTR_ERR(token);
+ }
/* if current token doesn't grant map creation permissions,
* then we can't use this token, so ignore it and rely on
@@ -1457,6 +1479,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
case BPF_MAP_TYPE_CGROUP_ARRAY:
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH:
+ case BPF_MAP_TYPE_RHASH:
case BPF_MAP_TYPE_PERCPU_HASH:
case BPF_MAP_TYPE_HASH_OF_MAPS:
case BPF_MAP_TYPE_RINGBUF:
@@ -1507,8 +1530,10 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
err = bpf_obj_name_cpy(map->name, attr->map_name,
sizeof(attr->map_name));
- if (err < 0)
+ if (err < 0) {
+ bpf_log(log, "Invalid map_name.\n");
goto free_map;
+ }
preempt_disable();
map->cookie = gen_cookie_next(&bpf_map_cookie);
@@ -1531,6 +1556,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
btf = btf_get_by_fd(attr->btf_fd);
if (IS_ERR(btf)) {
+ bpf_log(log, "Invalid btf_fd.\n");
err = PTR_ERR(btf);
goto free_map;
}
@@ -1558,6 +1584,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel);
if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) {
+ bpf_log(log, "Invalid excl_prog_hash_size.\n");
err = -EINVAL;
goto free_map;
}
@@ -1572,11 +1599,62 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
err = -EFAULT;
goto free_map;
}
+
+ /* See libbpf: emit_signature_match() */
+ BUILD_BUG_ON(offsetof(struct bpf_map, excl) != SHA256_DIGEST_SIZE);
+ BUILD_BUG_ON(!__same_type(map->excl, u32));
+ BUILD_BUG_ON(offsetof(struct bpf_map, sha) != 0);
+ BUILD_BUG_ON(!__same_type(map->sha, u8[SHA256_DIGEST_SIZE]));
+ map->excl = 1;
} else if (attr->excl_prog_hash_size) {
+ bpf_log(log, "Invalid excl_prog_hash_size.\n");
err = -EINVAL;
goto free_map;
}
+ *mapp = map;
+ *tokenp = token;
+ return 0;
+
+free_map:
+ bpf_map_free(map);
+put_token:
+ bpf_token_put(token);
+ return err;
+}
+
+static int map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_common_attr *attr_common,
+ bpfptr_t uattr_common, u32 size_common)
+{
+ struct bpf_token *token = NULL;
+ struct bpf_verifier_log *log;
+ struct bpf_log_attr attr_log;
+ struct bpf_map *map = NULL;
+ int err, ret;
+ int f_flags;
+
+ log = bpf_log_attr_create_vlog(&attr_log, attr_common, uattr_common, size_common);
+ if (IS_ERR(log))
+ return PTR_ERR(log);
+
+ err = map_create_alloc(attr, uattr, log, &map, &token);
+
+ /* preserve original error even if log finalization is successful */
+ ret = bpf_log_attr_finalize(&attr_log, log);
+ if (ret)
+ err = ret;
+
+ kfree(log);
+
+ if (err)
+ goto free_map;
+
+ f_flags = bpf_get_file_flag(attr->map_flags);
+ if (f_flags < 0) {
+ err = f_flags;
+ goto free_map;
+ }
+
err = security_bpf_map_create(map, attr, token, uattr.is_kernel);
if (err)
goto free_map_sec;
@@ -1605,8 +1683,8 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
free_map_sec:
security_bpf_map_free(map);
free_map:
- bpf_map_free(map);
-put_token:
+ if (map)
+ bpf_map_free(map);
bpf_token_put(token);
return err;
}
@@ -2192,6 +2270,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_RHASH ||
map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
if (!bpf_map_is_offloaded(map)) {
bpf_disable_instrumentation();
@@ -2646,7 +2725,8 @@ static int
bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
enum bpf_attach_type expected_attach_type,
struct btf *attach_btf, u32 btf_id,
- struct bpf_prog *dst_prog)
+ struct bpf_prog *dst_prog,
+ bool multi_func)
{
if (btf_id) {
if (btf_id > BTF_MAX_TYPE)
@@ -2666,6 +2746,14 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
}
}
+ if (multi_func) {
+ if (prog_type != BPF_PROG_TYPE_TRACING)
+ return -EINVAL;
+ if (!attach_btf || btf_id)
+ return -EINVAL;
+ return 0;
+ }
+
if (attach_btf && (!btf_id || dst_prog))
return -EINVAL;
@@ -2798,8 +2886,22 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
}
}
+static enum bpf_sig_keyring bpf_classify_keyring(s32 keyring_id)
+{
+ switch (keyring_id) {
+ case 0:
+ return BPF_SIG_KEYRING_BUILTIN;
+ case (s32)(unsigned long)VERIFY_USE_SECONDARY_KEYRING:
+ return BPF_SIG_KEYRING_SECONDARY;
+ case (s32)(unsigned long)VERIFY_USE_PLATFORM_KEYRING:
+ return BPF_SIG_KEYRING_PLATFORM;
+ default:
+ return BPF_SIG_KEYRING_USER;
+ }
+}
+
static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr,
- bool is_kernel)
+ bool is_kernel, s32 *keyring_serial)
{
bpfptr_t usig = make_bpfptr(attr->signature, is_kernel);
struct bpf_dynptr_kern sig_ptr, insns_ptr;
@@ -2835,7 +2937,8 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr
err = bpf_verify_pkcs7_signature((struct bpf_dynptr *)&insns_ptr,
(struct bpf_dynptr *)&sig_ptr, key);
-
+ if (!err)
+ *keyring_serial = bpf_key_serial(key);
bpf_key_put(key);
kvfree(sig);
return err;
@@ -2858,10 +2961,15 @@ static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog)
return 0;
}
+extern int bpf_multi_func(void);
+int __init __used bpf_multi_func(void) { return 0; }
+
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_multi_func_btf_id, func, bpf_multi_func)
+
/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD keyring_id
-static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
+static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog, *dst_prog = NULL;
@@ -2870,6 +2978,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
bool bpf_cap;
int err;
char license[128];
+ bool multi_func;
if (CHECK_ATTR(BPF_PROG_LOAD))
return -EINVAL;
@@ -2936,6 +3045,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON))
goto put_token;
+ multi_func = is_tracing_multi(attr->expected_attach_type);
+
/* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
* or btf, we need to check which one it is
*/
@@ -2957,7 +3068,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
goto put_token;
}
}
- } else if (attr->attach_btf_id) {
+ } else if (attr->attach_btf_id || multi_func) {
/* fall back to vmlinux BTF, if BTF type ID is specified */
attach_btf = bpf_get_btf_vmlinux();
if (IS_ERR(attach_btf)) {
@@ -2973,7 +3084,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
attach_btf, attr->attach_btf_id,
- dst_prog)) {
+ dst_prog, multi_func)) {
if (dst_prog)
bpf_prog_put(dst_prog);
if (attach_btf)
@@ -2996,7 +3107,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
prog->expected_attach_type = attr->expected_attach_type;
prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE);
prog->aux->attach_btf = attach_btf;
- prog->aux->attach_btf_id = attr->attach_btf_id;
+ prog->aux->attach_btf_id = multi_func ? bpf_multi_func_btf_id[0] : attr->attach_btf_id;
prog->aux->dst_prog = dst_prog;
prog->aux->dev_bound = !!attr->prog_ifindex;
prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
@@ -3022,13 +3133,17 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
/* eBPF programs must be GPL compatible to use GPL-ed functions */
prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0;
-
if (attr->signature) {
- err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel);
+ err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel,
+ &prog->aux->sig.keyring_serial);
if (err)
goto free_prog;
+ prog->aux->sig.keyring_type = bpf_classify_keyring(attr->keyring_id);
+ prog->aux->sig.verdict = BPF_SIG_VERIFIED;
+ } else {
+ prog->aux->sig.keyring_type = BPF_SIG_KEYRING_NONE;
+ prog->aux->sig.verdict = BPF_SIG_UNSIGNED;
}
-
prog->orig_prog = NULL;
prog->jited = 0;
@@ -3076,10 +3191,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
err = security_bpf_prog_load(prog, attr, token, uattr.is_kernel);
if (err)
- goto free_prog_sec;
+ goto free_prog;
/* run eBPF verifier */
- err = bpf_check(&prog, attr, uattr, uattr_size);
+ err = bpf_check(&prog, attr, uattr, attr_log);
if (err < 0)
goto free_used_maps;
@@ -3122,8 +3237,6 @@ free_used_maps:
__bpf_prog_put_noref(prog, prog->aux->real_func_cnt);
return err;
-free_prog_sec:
- security_bpf_prog_free(prog);
free_prog:
free_uid(prog->aux->user);
if (prog->aux->attach_btf)
@@ -3198,6 +3311,15 @@ void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
bpf_link_init_sleepable(link, type, ops, prog, attach_type, false);
}
+void bpf_tramp_link_init(struct bpf_tramp_link *link, enum bpf_link_type type,
+ const struct bpf_link_ops *ops, struct bpf_prog *prog,
+ enum bpf_attach_type attach_type, u64 cookie)
+{
+ bpf_link_init(&link->link, type, ops, prog, attach_type);
+ link->node.link = &link->link;
+ link->node.cookie = cookie;
+}
+
static void bpf_link_free_id(int id)
{
if (!id)
@@ -3358,7 +3480,7 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_KPROBE_MULTI_RETURN ?
"kretprobe_multi" : "kprobe_multi");
else if (link->type == BPF_LINK_TYPE_UPROBE_MULTI)
- seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_UPROBE_MULTI_RETURN ?
+ seq_printf(m, "link_type:\t%s\n", link->flags & BPF_F_UPROBE_MULTI_RETURN ?
"uretprobe_multi" : "uprobe_multi");
else
seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]);
@@ -3505,7 +3627,7 @@ static void bpf_tracing_link_release(struct bpf_link *link)
struct bpf_tracing_link *tr_link =
container_of(link, struct bpf_tracing_link, link.link);
- WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
+ WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link.node,
tr_link->trampoline,
tr_link->tgt_prog));
@@ -3518,8 +3640,7 @@ static void bpf_tracing_link_release(struct bpf_link *link)
static void bpf_tracing_link_dealloc(struct bpf_link *link)
{
- struct bpf_tracing_link *tr_link =
- container_of(link, struct bpf_tracing_link, link.link);
+ struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link);
kfree(tr_link);
}
@@ -3527,8 +3648,8 @@ static void bpf_tracing_link_dealloc(struct bpf_link *link)
static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
struct seq_file *seq)
{
- struct bpf_tracing_link *tr_link =
- container_of(link, struct bpf_tracing_link, link.link);
+ struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link);
+
u32 target_btf_id, target_obj_id;
bpf_trampoline_unpack_key(tr_link->trampoline->key,
@@ -3541,17 +3662,16 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
link->attach_type,
target_obj_id,
target_btf_id,
- tr_link->link.cookie);
+ tr_link->link.node.cookie);
}
static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
struct bpf_link_info *info)
{
- struct bpf_tracing_link *tr_link =
- container_of(link, struct bpf_tracing_link, link.link);
+ struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link);
info->tracing.attach_type = link->attach_type;
- info->tracing.cookie = tr_link->link.cookie;
+ info->tracing.cookie = tr_link->link.node.cookie;
bpf_trampoline_unpack_key(tr_link->trampoline->key,
&info->tracing.target_obj_id,
&info->tracing.target_btf_id);
@@ -3633,29 +3753,18 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id);
}
- if (prog->expected_attach_type == BPF_TRACE_FSESSION) {
- struct bpf_fsession_link *fslink;
-
- fslink = kzalloc_obj(*fslink, GFP_USER);
- if (fslink) {
- bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING,
- &bpf_tracing_link_lops, prog, attach_type);
- fslink->fexit.cookie = bpf_cookie;
- link = &fslink->link;
- } else {
- link = NULL;
- }
- } else {
- link = kzalloc_obj(*link, GFP_USER);
- }
+ link = kzalloc_obj(*link, GFP_USER);
if (!link) {
err = -ENOMEM;
goto out_put_prog;
}
- bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING,
- &bpf_tracing_link_lops, prog, attach_type);
+ bpf_tramp_link_init(&link->link, BPF_LINK_TYPE_TRACING,
+ &bpf_tracing_link_lops, prog, attach_type, bpf_cookie);
- link->link.cookie = bpf_cookie;
+ if (prog->expected_attach_type == BPF_TRACE_FSESSION) {
+ link->fexit.link = &link->link.link;
+ link->fexit.cookie = bpf_cookie;
+ }
mutex_lock(&prog->aux->dst_mutex);
@@ -3758,7 +3867,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
if (err)
goto out_unlock;
- err = bpf_trampoline_link_prog(&link->link, tr, tgt_prog);
+ err = bpf_trampoline_link_prog(&link->link.node, tr, tgt_prog);
if (err) {
bpf_link_cleanup(&link_primer);
link = NULL;
@@ -4281,6 +4390,11 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
if (!btp)
return -ENOENT;
+ if (prog->sleepable && !tracepoint_is_faultable(btp->tp)) {
+ bpf_put_raw_tracepoint(btp);
+ return -EINVAL;
+ }
+
link = kzalloc_obj(*link, GFP_USER);
if (!link) {
err = -ENOMEM;
@@ -4389,6 +4503,9 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
case BPF_TRACE_FSESSION:
+ case BPF_TRACE_FSESSION_MULTI:
+ case BPF_TRACE_FENTRY_MULTI:
+ case BPF_TRACE_FEXIT_MULTI:
case BPF_MODIFY_RETURN:
return BPF_PROG_TYPE_TRACING;
case BPF_LSM_MAC:
@@ -4654,7 +4771,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
#define BPF_PROG_QUERY_LAST_FIELD query.revision
static int bpf_prog_query(const union bpf_attr *attr,
- union bpf_attr __user *uattr)
+ union bpf_attr __user *uattr, u32 uattr_size)
{
if (!bpf_net_capable())
return -EPERM;
@@ -4693,7 +4810,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_GETSOCKOPT:
case BPF_CGROUP_SETSOCKOPT:
case BPF_LSM_CGROUP:
- return cgroup_bpf_prog_query(attr, uattr);
+ return cgroup_bpf_prog_query(attr, uattr, uattr_size);
case BPF_LIRC_MODE2:
return lirc_prog_query(attr, uattr);
case BPF_FLOW_DISSECTOR:
@@ -4919,6 +5036,29 @@ out:
return map;
}
+static void prepare_dump_pseudo_call(struct bpf_insn *insn)
+{
+ s32 call_off = insn->imm;
+
+ /*
+ * BPF_CALL_ARGS only exists for interpreter fallback.
+ * 1. For interpreter (BPF_CALL_ARGS): insn->off is the index of
+ * interpreters_args array, so here using bpf_call_args_imm()
+ * to get the real address offset.
+ * 2. For JIT (BPF_CALL): insn->off is the subprog id.
+ */
+ if (insn->code == (BPF_JMP | BPF_CALL_ARGS))
+ insn->imm = bpf_call_args_imm(insn->off);
+ else
+ insn->imm = insn->off;
+
+ /* Avoid dumping a truncated and misleading pc-relative offset. */
+ if (call_off > S16_MAX || call_off < S16_MIN)
+ insn->off = 0;
+ else
+ insn->off = call_off;
+}
+
static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog,
const struct cred *f_cred)
{
@@ -4944,6 +5084,9 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog,
}
if (code == (BPF_JMP | BPF_CALL) ||
code == (BPF_JMP | BPF_CALL_ARGS)) {
+ /* Restore the legacy xlated dump layout. */
+ if (insns[i].src_reg == BPF_PSEUDO_CALL)
+ prepare_dump_pseudo_call(&insns[i]);
if (code == (BPF_JMP | BPF_CALL_ARGS))
insns[i].code = BPF_JMP | BPF_CALL;
if (!bpf_dump_raw_ok(f_cred))
@@ -5019,10 +5162,11 @@ static int bpf_prog_get_info_by_fd(struct file *file,
u32 info_len = attr->info.info_len;
struct bpf_prog_kstats stats;
char __user *uinsns;
- u32 ulen;
+ u32 ulen, len;
int err;
- err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
+ len = offsetofend(struct bpf_prog_info, attach_btf_id);
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), len, info_len);
if (err)
return err;
info_len = min_t(u32, sizeof(info), info_len);
@@ -5304,10 +5448,11 @@ static int bpf_map_get_info_by_fd(struct file *file,
{
struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
struct bpf_map_info info;
- u32 info_len = attr->info.info_len;
+ u32 info_len = attr->info.info_len, len;
int err;
- err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
+ len = offsetofend(struct bpf_map_info, hash_size);
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), len, info_len);
if (err)
return err;
info_len = min_t(u32, sizeof(info), info_len);
@@ -5345,18 +5490,16 @@ static int bpf_map_get_info_by_fd(struct file *file,
if (!map->ops->map_get_hash)
return -EINVAL;
-
- if (info.hash_size != SHA256_DIGEST_SIZE)
+ if (info.hash_size != sizeof(map->sha))
return -EINVAL;
-
if (!READ_ONCE(map->frozen))
return -EPERM;
- err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha);
+ err = map->ops->map_get_hash(map);
if (err != 0)
return err;
- if (copy_to_user(uhash, map->sha, SHA256_DIGEST_SIZE) != 0)
+ if (copy_to_user(uhash, map->sha, sizeof(map->sha)) != 0)
return -EFAULT;
} else if (info.hash_size) {
return -EINVAL;
@@ -5469,7 +5612,7 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
#define BPF_BTF_LOAD_LAST_FIELD btf_token_fd
-static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
+static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log)
{
struct bpf_token *token = NULL;
@@ -5496,7 +5639,7 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_
bpf_token_put(token);
- return btf_new_fd(attr, uattr, uattr_size);
+ return btf_new_fd(attr, uattr, attr_log);
}
#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD fd_by_id_token_fd
@@ -5697,7 +5840,7 @@ err_put:
return err;
}
-#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid
+#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.path_fd
static int link_create(union bpf_attr *attr, bpfptr_t uattr)
{
struct bpf_prog *prog;
@@ -5748,6 +5891,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
ret = bpf_iter_link_attach(attr, uattr, prog);
else if (prog->expected_attach_type == BPF_LSM_CGROUP)
ret = cgroup_bpf_link_attach(attr, prog);
+ else if (is_tracing_multi(prog->expected_attach_type))
+ ret = bpf_tracing_multi_attach(prog, attr);
else
ret = bpf_tracing_prog_attach(prog,
attr->link_create.target_fd,
@@ -6206,8 +6351,12 @@ put_prog:
return ret;
}
-static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
+static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size,
+ bpfptr_t uattr_common, unsigned int size_common)
{
+ struct bpf_common_attr attr_common;
+ u32 offsetof_log_true_size = 0;
+ struct bpf_log_attr attr_log;
union bpf_attr attr;
int err;
@@ -6221,13 +6370,29 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
if (copy_from_bpfptr(&attr, uattr, size) != 0)
return -EFAULT;
+ memset(&attr_common, 0, sizeof(attr_common));
+ if (cmd & BPF_COMMON_ATTRS) {
+ err = bpf_check_uarg_tail_zero(uattr_common,
+ offsetofend(struct bpf_common_attr, log_true_size),
+ size_common);
+ if (err)
+ return err;
+
+ cmd &= ~BPF_COMMON_ATTRS;
+ size_common = min_t(u32, size_common, sizeof(attr_common));
+ if (copy_from_bpfptr(&attr_common, uattr_common, size_common) != 0)
+ return -EFAULT;
+ } else {
+ size_common = 0;
+ }
+
err = security_bpf(cmd, &attr, size, uattr.is_kernel);
if (err < 0)
return err;
switch (cmd) {
case BPF_MAP_CREATE:
- err = map_create(&attr, uattr);
+ err = map_create(&attr, uattr, &attr_common, uattr_common, size_common);
break;
case BPF_MAP_LOOKUP_ELEM:
err = map_lookup_elem(&attr);
@@ -6245,7 +6410,12 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
err = map_freeze(&attr);
break;
case BPF_PROG_LOAD:
- err = bpf_prog_load(&attr, uattr, size);
+ if (size >= offsetofend(union bpf_attr, log_true_size))
+ offsetof_log_true_size = offsetof(union bpf_attr, log_true_size);
+ err = bpf_log_attr_init(&attr_log, attr.log_buf, attr.log_size, attr.log_level,
+ offsetof_log_true_size, uattr, &attr_common, uattr_common,
+ size_common);
+ err = err ?: bpf_prog_load(&attr, uattr, &attr_log);
break;
case BPF_OBJ_PIN:
err = bpf_obj_pin(&attr);
@@ -6260,7 +6430,7 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
err = bpf_prog_detach(&attr);
break;
case BPF_PROG_QUERY:
- err = bpf_prog_query(&attr, uattr.user);
+ err = bpf_prog_query(&attr, uattr.user, size);
break;
case BPF_PROG_TEST_RUN:
err = bpf_prog_test_run(&attr, uattr.user);
@@ -6290,7 +6460,12 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
err = bpf_raw_tracepoint_open(&attr);
break;
case BPF_BTF_LOAD:
- err = bpf_btf_load(&attr, uattr, size);
+ if (size >= offsetofend(union bpf_attr, btf_log_true_size))
+ offsetof_log_true_size = offsetof(union bpf_attr, btf_log_true_size);
+ err = bpf_log_attr_init(&attr_log, attr.btf_log_buf, attr.btf_log_size,
+ attr.btf_log_level, offsetof_log_true_size, uattr,
+ &attr_common, uattr_common, size_common);
+ err = err ?: bpf_btf_load(&attr, uattr, &attr_log);
break;
case BPF_BTF_GET_FD_BY_ID:
err = bpf_btf_get_fd_by_id(&attr);
@@ -6356,9 +6531,10 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
return err;
}
-SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+SYSCALL_DEFINE5(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size,
+ struct bpf_common_attr __user *, uattr_common, unsigned int, size_common)
{
- return __sys_bpf(cmd, USER_BPFPTR(uattr), size);
+ return __sys_bpf(cmd, USER_BPFPTR(uattr), size, USER_BPFPTR(uattr_common), size_common);
}
static bool syscall_prog_is_valid_access(int off, int size,
@@ -6388,7 +6564,7 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
default:
return -EINVAL;
}
- return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
+ return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size, KERNEL_BPFPTR(NULL), 0);
}
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index f02254a21585..1a721fc4bef5 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -30,8 +30,46 @@ static struct hlist_head trampoline_ip_table[TRAMPOLINE_TABLE_SIZE];
/* serializes access to trampoline tables */
static DEFINE_MUTEX(trampoline_mutex);
+/*
+ * Keep 32 trampoline locks (5 bits) in the pool so trampoline_lock_all()
+ * stays below MAX_LOCK_DEPTH. Each pool slot has a distinct lockdep
+ * class because trampoline_lock_all() takes all pool mutexes at once;
+ * otherwise lockdep would report recursive locking on same-class mutexes.
+ */
+#define TRAMPOLINE_LOCKS_BITS 5
+#define TRAMPOLINE_LOCKS_TABLE_SIZE (1 << TRAMPOLINE_LOCKS_BITS)
+
+static struct {
+ struct mutex mutex;
+ struct lock_class_key key;
+} trampoline_locks[TRAMPOLINE_LOCKS_TABLE_SIZE];
+
+static struct mutex *select_trampoline_lock(struct bpf_trampoline *tr)
+{
+ return &trampoline_locks[hash_ptr(tr, TRAMPOLINE_LOCKS_BITS)].mutex;
+}
+
+static void trampoline_lock(struct bpf_trampoline *tr)
+{
+ mutex_lock(select_trampoline_lock(tr));
+}
+
+static void trampoline_unlock(struct bpf_trampoline *tr)
+{
+ mutex_unlock(select_trampoline_lock(tr));
+}
+
+struct bpf_trampoline_ops {
+ int (*register_fentry)(struct bpf_trampoline *tr, struct bpf_tramp_image *im, void *data);
+ int (*unregister_fentry)(struct bpf_trampoline *tr, u32 orig_flags, void *data);
+ int (*modify_fentry)(struct bpf_trampoline *tr, u32 orig_flags, struct bpf_tramp_image *im,
+ bool lock_direct_mutex, void *data);
+};
+
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
-static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex);
+static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex,
+ const struct bpf_trampoline_ops *ops, void *data);
+static const struct bpf_trampoline_ops trampoline_ops;
#ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS
static struct bpf_trampoline *direct_ops_ip_lookup(struct ftrace_ops *ops, unsigned long ip)
@@ -69,9 +107,9 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip,
if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) {
/* This is called inside register_ftrace_direct_multi(), so
- * tr->mutex is already locked.
+ * trampoline's mutex is already locked.
*/
- lockdep_assert_held_once(&tr->mutex);
+ lockdep_assert_held_once(select_trampoline_lock(tr));
/* Instead of updating the trampoline here, we propagate
* -EAGAIN to register_ftrace_direct(). Then we can
@@ -91,7 +129,7 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip,
}
/* The normal locking order is
- * tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c)
+ * select_trampoline_lock(tr) => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c)
*
* The following two commands are called from
*
@@ -99,12 +137,12 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip,
* cleanup_direct_functions_after_ipmodify
*
* In both cases, direct_mutex is already locked. Use
- * mutex_trylock(&tr->mutex) to avoid deadlock in race condition
- * (something else is making changes to this same trampoline).
+ * mutex_trylock(select_trampoline_lock(tr)) to avoid deadlock in race condition
+ * (something else holds the same pool lock).
*/
- if (!mutex_trylock(&tr->mutex)) {
- /* sleep 1 ms to make sure whatever holding tr->mutex makes
- * some progress.
+ if (!mutex_trylock(select_trampoline_lock(tr))) {
+ /* sleep 1 ms to make sure whatever holding select_trampoline_lock(tr)
+ * makes some progress.
*/
msleep(1);
return -EAGAIN;
@@ -116,20 +154,22 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip,
if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
!(tr->flags & BPF_TRAMP_F_ORIG_STACK))
- ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
+ ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */,
+ &trampoline_ops, NULL);
break;
case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER:
tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY;
if (tr->flags & BPF_TRAMP_F_ORIG_STACK)
- ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
+ ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */,
+ &trampoline_ops, NULL);
break;
default:
ret = -EINVAL;
break;
}
- mutex_unlock(&tr->mutex);
+ trampoline_unlock(tr);
return ret;
}
#endif
@@ -142,7 +182,9 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
switch (ptype) {
case BPF_PROG_TYPE_TRACING:
if (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
- eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION)
+ eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION ||
+ eatype == BPF_TRACE_FENTRY_MULTI || eatype == BPF_TRACE_FEXIT_MULTI ||
+ eatype == BPF_TRACE_FSESSION_MULTI)
return true;
return false;
case BPF_PROG_TYPE_LSM:
@@ -359,7 +401,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key, unsigned long ip)
head = &trampoline_ip_table[hash_64(tr->ip, TRAMPOLINE_HASH_BITS)];
hlist_add_head(&tr->hlist_ip, head);
refcount_set(&tr->refcnt, 1);
- mutex_init(&tr->mutex);
for (i = 0; i < BPF_TRAMP_MAX; i++)
INIT_HLIST_HEAD(&tr->progs_hlist[i]);
out:
@@ -386,9 +427,11 @@ static int bpf_trampoline_update_fentry(struct bpf_trampoline *tr, u32 orig_flag
return bpf_arch_text_poke(ip, old_t, new_t, old_addr, new_addr);
}
-static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags,
- void *old_addr)
+static void bpf_tramp_image_put(struct bpf_tramp_image *im);
+
+static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, void *data __maybe_unused)
{
+ void *old_addr = tr->cur_image->image;
int ret;
if (tr->func.ftrace_managed)
@@ -396,13 +439,19 @@ static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags,
else
ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, NULL);
- return ret;
+ if (ret)
+ return ret;
+
+ bpf_tramp_image_put(tr->cur_image);
+ tr->cur_image = NULL;
+ return 0;
}
-static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags,
- void *old_addr, void *new_addr,
- bool lock_direct_mutex)
+static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, struct bpf_tramp_image *im,
+ bool lock_direct_mutex, void *data __maybe_unused)
{
+ void *old_addr = tr->cur_image->image;
+ void *new_addr = im->image;
int ret;
if (tr->func.ftrace_managed) {
@@ -411,12 +460,20 @@ static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags,
ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr,
new_addr);
}
- return ret;
+
+ if (ret)
+ return ret;
+
+ bpf_tramp_image_put(tr->cur_image);
+ tr->cur_image = im;
+ return 0;
}
/* first time registering */
-static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
+static int register_fentry(struct bpf_trampoline *tr, struct bpf_tramp_image *im,
+ void *data __maybe_unused)
{
+ void *new_addr = im->image;
void *ip = tr->func.addr;
unsigned long faddr;
int ret;
@@ -434,33 +491,42 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr);
}
- return ret;
+ if (ret)
+ return ret;
+
+ tr->cur_image = im;
+ return 0;
}
-static struct bpf_tramp_links *
+static const struct bpf_trampoline_ops trampoline_ops = {
+ .register_fentry = register_fentry,
+ .unregister_fentry = unregister_fentry,
+ .modify_fentry = modify_fentry,
+};
+
+static struct bpf_tramp_nodes *
bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg)
{
- struct bpf_tramp_link *link;
- struct bpf_tramp_links *tlinks;
- struct bpf_tramp_link **links;
+ struct bpf_tramp_node *node, **nodes;
+ struct bpf_tramp_nodes *tnodes;
int kind;
*total = 0;
- tlinks = kzalloc_objs(*tlinks, BPF_TRAMP_MAX);
- if (!tlinks)
+ tnodes = kzalloc_objs(*tnodes, BPF_TRAMP_MAX);
+ if (!tnodes)
return ERR_PTR(-ENOMEM);
for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
- tlinks[kind].nr_links = tr->progs_cnt[kind];
+ tnodes[kind].nr_nodes = tr->progs_cnt[kind];
*total += tr->progs_cnt[kind];
- links = tlinks[kind].links;
+ nodes = tnodes[kind].nodes;
- hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) {
- *ip_arg |= link->link.prog->call_get_func_ip;
- *links++ = link;
+ hlist_for_each_entry(node, &tr->progs_hlist[kind], tramp_hlist) {
+ *ip_arg |= node->link->prog->call_get_func_ip;
+ *nodes++ = node;
}
}
- return tlinks;
+ return tnodes;
}
static void bpf_tramp_image_free(struct bpf_tramp_image *im)
@@ -604,30 +670,29 @@ out:
return ERR_PTR(err);
}
-static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex)
+static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex,
+ const struct bpf_trampoline_ops *ops, void *data)
{
struct bpf_tramp_image *im;
- struct bpf_tramp_links *tlinks;
+ struct bpf_tramp_nodes *tnodes;
u32 orig_flags = tr->flags;
bool ip_arg = false;
int err, total, size;
- tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg);
- if (IS_ERR(tlinks))
- return PTR_ERR(tlinks);
+ tnodes = bpf_trampoline_get_progs(tr, &total, &ip_arg);
+ if (IS_ERR(tnodes))
+ return PTR_ERR(tnodes);
if (total == 0) {
- err = unregister_fentry(tr, orig_flags, tr->cur_image->image);
- bpf_tramp_image_put(tr->cur_image);
- tr->cur_image = NULL;
+ err = ops->unregister_fentry(tr, orig_flags, data);
goto out;
}
/* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */
tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX);
- if (tlinks[BPF_TRAMP_FEXIT].nr_links ||
- tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) {
+ if (tnodes[BPF_TRAMP_FEXIT].nr_nodes ||
+ tnodes[BPF_TRAMP_MODIFY_RETURN].nr_nodes) {
/* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME
* should not be set together.
*/
@@ -658,7 +723,7 @@ again:
#endif
size = arch_bpf_trampoline_size(&tr->func.model, tr->flags,
- tlinks, tr->func.addr);
+ tnodes, tr->func.addr);
if (size < 0) {
err = size;
goto out;
@@ -676,7 +741,7 @@ again:
}
err = arch_prepare_bpf_trampoline(im, im->image, im->image + size,
- &tr->func.model, tr->flags, tlinks,
+ &tr->func.model, tr->flags, tnodes,
tr->func.addr);
if (err < 0)
goto out_free;
@@ -685,14 +750,12 @@ again:
if (err)
goto out_free;
- WARN_ON(tr->cur_image && total == 0);
if (tr->cur_image)
/* progs already running at this address */
- err = modify_fentry(tr, orig_flags, tr->cur_image->image,
- im->image, lock_direct_mutex);
+ err = ops->modify_fentry(tr, orig_flags, im, lock_direct_mutex, data);
else
/* first time registering */
- err = register_fentry(tr, im->image);
+ err = ops->register_fentry(tr, im, data);
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
if (err == -EAGAIN) {
@@ -704,34 +767,31 @@ again:
goto again;
}
#endif
- if (err)
- goto out_free;
- if (tr->cur_image)
- bpf_tramp_image_put(tr->cur_image);
- tr->cur_image = im;
+out_free:
+ if (err)
+ bpf_tramp_image_free(im);
out:
/* If any error happens, restore previous flags */
if (err)
tr->flags = orig_flags;
- kfree(tlinks);
+ kfree(tnodes);
return err;
-
-out_free:
- bpf_tramp_image_free(im);
- goto out;
}
static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
{
switch (prog->expected_attach_type) {
case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FENTRY_MULTI:
return BPF_TRAMP_FENTRY;
case BPF_MODIFY_RETURN:
return BPF_TRAMP_MODIFY_RETURN;
case BPF_TRACE_FEXIT:
+ case BPF_TRACE_FEXIT_MULTI:
return BPF_TRAMP_FEXIT;
case BPF_TRACE_FSESSION:
+ case BPF_TRACE_FSESSION_MULTI:
return BPF_TRAMP_FSESSION;
case BPF_LSM_MAC:
if (!prog->aux->attach_func_proto->type)
@@ -764,39 +824,33 @@ static int bpf_freplace_check_tgt_prog(struct bpf_prog *tgt_prog)
return 0;
}
-static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
- struct bpf_trampoline *tr,
- struct bpf_prog *tgt_prog)
+static struct bpf_tramp_node *fsession_exit(struct bpf_tramp_node *node)
{
- struct bpf_fsession_link *fslink = NULL;
- enum bpf_tramp_prog_type kind;
- struct bpf_tramp_link *link_exiting;
- struct hlist_head *prog_list;
- int err = 0;
- int cnt = 0, i;
+ if (node->link->type == BPF_LINK_TYPE_TRACING) {
+ struct bpf_tracing_link *link;
- kind = bpf_attach_type_to_tramp(link->link.prog);
- if (tr->extension_prog)
- /* cannot attach fentry/fexit if extension prog is attached.
- * cannot overwrite extension prog either.
- */
- return -EBUSY;
+ link = container_of(node->link, struct bpf_tracing_link, link.link);
+ return &link->fexit;
+ } else if (node->link->type == BPF_LINK_TYPE_TRACING_MULTI) {
+ struct bpf_tracing_multi_link *link;
+ struct bpf_tracing_multi_node *mnode;
- for (i = 0; i < BPF_TRAMP_MAX; i++)
- cnt += tr->progs_cnt[i];
-
- if (kind == BPF_TRAMP_REPLACE) {
- /* Cannot attach extension if fentry/fexit are in use. */
- if (cnt)
- return -EBUSY;
- err = bpf_freplace_check_tgt_prog(tgt_prog);
- if (err)
- return err;
- tr->extension_prog = link->link.prog;
- return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP,
- BPF_MOD_JUMP, NULL,
- link->link.prog->bpf_func);
+ link = container_of(node->link, struct bpf_tracing_multi_link, link);
+ mnode = container_of(node, struct bpf_tracing_multi_node, node);
+ return &link->fexits[mnode - link->nodes];
}
+ return NULL;
+}
+
+static int bpf_trampoline_add_prog(struct bpf_trampoline *tr,
+ struct bpf_tramp_node *node,
+ int cnt)
+{
+ enum bpf_tramp_prog_type kind;
+ struct bpf_tramp_node *node_existing, *fexit;
+ struct hlist_head *prog_list;
+
+ kind = bpf_attach_type_to_tramp(node->link->prog);
if (kind == BPF_TRAMP_FSESSION) {
prog_list = &tr->progs_hlist[BPF_TRAMP_FENTRY];
cnt++;
@@ -805,59 +859,112 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
}
if (cnt >= BPF_MAX_TRAMP_LINKS)
return -E2BIG;
- if (!hlist_unhashed(&link->tramp_hlist))
+ if (!hlist_unhashed(&node->tramp_hlist))
/* prog already linked */
return -EBUSY;
- hlist_for_each_entry(link_exiting, prog_list, tramp_hlist) {
- if (link_exiting->link.prog != link->link.prog)
+ hlist_for_each_entry(node_existing, prog_list, tramp_hlist) {
+ if (node_existing->link->prog != node->link->prog)
continue;
/* prog already linked */
return -EBUSY;
}
- hlist_add_head(&link->tramp_hlist, prog_list);
+ hlist_add_head(&node->tramp_hlist, prog_list);
if (kind == BPF_TRAMP_FSESSION) {
tr->progs_cnt[BPF_TRAMP_FENTRY]++;
- fslink = container_of(link, struct bpf_fsession_link, link.link);
- hlist_add_head(&fslink->fexit.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]);
+ fexit = fsession_exit(node);
+ if (WARN_ON_ONCE(!fexit))
+ return -EINVAL;
+ hlist_add_head(&fexit->tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]);
tr->progs_cnt[BPF_TRAMP_FEXIT]++;
} else {
tr->progs_cnt[kind]++;
}
- err = bpf_trampoline_update(tr, true /* lock_direct_mutex */);
- if (err) {
- hlist_del_init(&link->tramp_hlist);
- if (kind == BPF_TRAMP_FSESSION) {
- tr->progs_cnt[BPF_TRAMP_FENTRY]--;
- hlist_del_init(&fslink->fexit.tramp_hlist);
- tr->progs_cnt[BPF_TRAMP_FEXIT]--;
- } else {
- tr->progs_cnt[kind]--;
- }
+ return 0;
+}
+
+static void bpf_trampoline_remove_prog(struct bpf_trampoline *tr,
+ struct bpf_tramp_node *node)
+{
+ enum bpf_tramp_prog_type kind;
+ struct bpf_tramp_node *fexit;
+
+ kind = bpf_attach_type_to_tramp(node->link->prog);
+ if (kind == BPF_TRAMP_FSESSION) {
+ fexit = fsession_exit(node);
+ if (WARN_ON_ONCE(!fexit))
+ return;
+ hlist_del_init(&fexit->tramp_hlist);
+ tr->progs_cnt[BPF_TRAMP_FEXIT]--;
+ kind = BPF_TRAMP_FENTRY;
}
+ hlist_del_init(&node->tramp_hlist);
+ tr->progs_cnt[kind]--;
+}
+
+static int __bpf_trampoline_link_prog(struct bpf_tramp_node *node,
+ struct bpf_trampoline *tr,
+ struct bpf_prog *tgt_prog,
+ const struct bpf_trampoline_ops *ops,
+ void *data)
+{
+ enum bpf_tramp_prog_type kind;
+ int err = 0;
+ int cnt = 0, i;
+
+ kind = bpf_attach_type_to_tramp(node->link->prog);
+ if (tr->extension_prog)
+ /* cannot attach fentry/fexit if extension prog is attached.
+ * cannot overwrite extension prog either.
+ */
+ return -EBUSY;
+
+ for (i = 0; i < BPF_TRAMP_MAX; i++)
+ cnt += tr->progs_cnt[i];
+
+ if (kind == BPF_TRAMP_REPLACE) {
+ /* Cannot attach extension if fentry/fexit are in use. */
+ if (cnt)
+ return -EBUSY;
+ err = bpf_freplace_check_tgt_prog(tgt_prog);
+ if (err)
+ return err;
+ tr->extension_prog = node->link->prog;
+ return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP,
+ BPF_MOD_JUMP, NULL,
+ node->link->prog->bpf_func);
+ }
+ err = bpf_trampoline_add_prog(tr, node, cnt);
+ if (err)
+ return err;
+ err = bpf_trampoline_update(tr, true /* lock_direct_mutex */, ops, data);
+ if (err)
+ bpf_trampoline_remove_prog(tr, node);
return err;
}
-int bpf_trampoline_link_prog(struct bpf_tramp_link *link,
+int bpf_trampoline_link_prog(struct bpf_tramp_node *node,
struct bpf_trampoline *tr,
struct bpf_prog *tgt_prog)
{
int err;
- mutex_lock(&tr->mutex);
- err = __bpf_trampoline_link_prog(link, tr, tgt_prog);
- mutex_unlock(&tr->mutex);
+ trampoline_lock(tr);
+ err = __bpf_trampoline_link_prog(node, tr, tgt_prog, &trampoline_ops, NULL);
+ trampoline_unlock(tr);
return err;
}
-static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
+static int __bpf_trampoline_unlink_prog(struct bpf_tramp_node *node,
struct bpf_trampoline *tr,
- struct bpf_prog *tgt_prog)
+ struct bpf_prog *tgt_prog,
+ const struct bpf_trampoline_ops *ops,
+ void *data)
{
enum bpf_tramp_prog_type kind;
int err;
- kind = bpf_attach_type_to_tramp(link->link.prog);
+ kind = bpf_attach_type_to_tramp(node->link->prog);
if (kind == BPF_TRAMP_REPLACE) {
WARN_ON_ONCE(!tr->extension_prog);
err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
@@ -867,29 +974,21 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
guard(mutex)(&tgt_prog->aux->ext_mutex);
tgt_prog->aux->is_extended = false;
return err;
- } else if (kind == BPF_TRAMP_FSESSION) {
- struct bpf_fsession_link *fslink =
- container_of(link, struct bpf_fsession_link, link.link);
-
- hlist_del_init(&fslink->fexit.tramp_hlist);
- tr->progs_cnt[BPF_TRAMP_FEXIT]--;
- kind = BPF_TRAMP_FENTRY;
}
- hlist_del_init(&link->tramp_hlist);
- tr->progs_cnt[kind]--;
- return bpf_trampoline_update(tr, true /* lock_direct_mutex */);
+ bpf_trampoline_remove_prog(tr, node);
+ return bpf_trampoline_update(tr, true /* lock_direct_mutex */, ops, data);
}
/* bpf_trampoline_unlink_prog() should never fail. */
-int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
+int bpf_trampoline_unlink_prog(struct bpf_tramp_node *node,
struct bpf_trampoline *tr,
struct bpf_prog *tgt_prog)
{
int err;
- mutex_lock(&tr->mutex);
- err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog);
- mutex_unlock(&tr->mutex);
+ trampoline_lock(tr);
+ err = __bpf_trampoline_unlink_prog(node, tr, tgt_prog, &trampoline_ops, NULL);
+ trampoline_unlock(tr);
return err;
}
@@ -903,7 +1002,7 @@ static void bpf_shim_tramp_link_release(struct bpf_link *link)
if (!shim_link->trampoline)
return;
- WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline, NULL));
+ WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link.node, shim_link->trampoline, NULL));
bpf_trampoline_put(shim_link->trampoline);
}
@@ -949,8 +1048,8 @@ static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog
p->type = BPF_PROG_TYPE_LSM;
p->expected_attach_type = BPF_LSM_MAC;
bpf_prog_inc(p);
- bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC,
- &bpf_shim_tramp_link_lops, p, attach_type);
+ bpf_tramp_link_init(&shim_link->link, BPF_LINK_TYPE_UNSPEC,
+ &bpf_shim_tramp_link_lops, p, attach_type, 0);
bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype);
return shim_link;
@@ -959,15 +1058,15 @@ static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog
static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr,
bpf_func_t bpf_func)
{
- struct bpf_tramp_link *link;
+ struct bpf_tramp_node *node;
int kind;
for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
- hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) {
- struct bpf_prog *p = link->link.prog;
+ hlist_for_each_entry(node, &tr->progs_hlist[kind], tramp_hlist) {
+ struct bpf_prog *p = node->link->prog;
if (p->bpf_func == bpf_func)
- return container_of(link, struct bpf_shim_tramp_link, link);
+ return container_of(node, struct bpf_shim_tramp_link, link.node);
}
}
@@ -999,12 +1098,12 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
if (!tr)
return -ENOMEM;
- mutex_lock(&tr->mutex);
+ trampoline_lock(tr);
shim_link = cgroup_shim_find(tr, bpf_func);
if (shim_link && !IS_ERR(bpf_link_inc_not_zero(&shim_link->link.link))) {
/* Reusing existing shim attached by the other program. */
- mutex_unlock(&tr->mutex);
+ trampoline_unlock(tr);
bpf_trampoline_put(tr); /* bpf_trampoline_get above */
return 0;
}
@@ -1017,23 +1116,23 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
goto err;
}
- err = __bpf_trampoline_link_prog(&shim_link->link, tr, NULL);
+ err = __bpf_trampoline_link_prog(&shim_link->link.node, tr, NULL, &trampoline_ops, NULL);
if (err)
goto err;
shim_link->trampoline = tr;
/* note, we're still holding tr refcnt from above */
- mutex_unlock(&tr->mutex);
+ trampoline_unlock(tr);
return 0;
err:
- mutex_unlock(&tr->mutex);
+ trampoline_unlock(tr);
if (shim_link)
bpf_link_put(&shim_link->link.link);
- /* have to release tr while _not_ holding its mutex */
+ /* have to release tr while _not_ holding pool mutex for trampoline */
bpf_trampoline_put(tr); /* bpf_trampoline_get above */
return err;
@@ -1054,9 +1153,9 @@ void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog)
if (WARN_ON_ONCE(!tr))
return;
- mutex_lock(&tr->mutex);
+ trampoline_lock(tr);
shim_link = cgroup_shim_find(tr, bpf_func);
- mutex_unlock(&tr->mutex);
+ trampoline_unlock(tr);
if (shim_link)
bpf_link_put(&shim_link->link.link);
@@ -1074,14 +1173,14 @@ struct bpf_trampoline *bpf_trampoline_get(u64 key,
if (!tr)
return NULL;
- mutex_lock(&tr->mutex);
+ trampoline_lock(tr);
if (tr->func.addr)
goto out;
memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel));
tr->func.addr = (void *)tgt_info->tgt_addr;
out:
- mutex_unlock(&tr->mutex);
+ trampoline_unlock(tr);
return tr;
}
@@ -1094,7 +1193,6 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
mutex_lock(&trampoline_mutex);
if (!refcount_dec_and_test(&tr->refcnt))
goto out;
- WARN_ON_ONCE(mutex_is_locked(&tr->mutex));
for (i = 0; i < BPF_TRAMP_MAX; i++)
if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[i])))
@@ -1333,7 +1431,7 @@ bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog)
int __weak
arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
- struct bpf_tramp_links *tlinks,
+ struct bpf_tramp_nodes *tnodes,
void *func_addr)
{
return -ENOTSUPP;
@@ -1367,11 +1465,288 @@ int __weak arch_protect_bpf_trampoline(void *image, unsigned int size)
}
int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
- struct bpf_tramp_links *tlinks, void *func_addr)
+ struct bpf_tramp_nodes *tnodes, void *func_addr)
{
return -ENOTSUPP;
}
+#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) && \
+ defined(CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS) && \
+ defined(CONFIG_BPF_SYSCALL)
+
+static void trampoline_lock_all(void)
+{
+ int i;
+
+ for (i = 0; i < TRAMPOLINE_LOCKS_TABLE_SIZE; i++)
+ mutex_lock(&trampoline_locks[i].mutex);
+}
+
+static void trampoline_unlock_all(void)
+{
+ int i;
+
+ for (i = 0; i < TRAMPOLINE_LOCKS_TABLE_SIZE; i++)
+ mutex_unlock(&trampoline_locks[i].mutex);
+}
+
+static void remove_tracing_multi_data(struct bpf_tracing_multi_data *data)
+{
+ ftrace_hash_remove(data->reg);
+ ftrace_hash_remove(data->unreg);
+ ftrace_hash_remove(data->modify);
+}
+
+static void clear_tracing_multi_data(struct bpf_tracing_multi_data *data)
+{
+ remove_tracing_multi_data(data);
+
+ free_ftrace_hash(data->reg);
+ free_ftrace_hash(data->unreg);
+ free_ftrace_hash(data->modify);
+}
+
+static int init_tracing_multi_data(struct bpf_tracing_multi_data *data)
+{
+ data->reg = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
+ data->unreg = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
+ data->modify = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
+
+ if (!data->reg || !data->unreg || !data->modify) {
+ clear_tracing_multi_data(data);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static void ftrace_hash_add(struct ftrace_hash *hash, struct ftrace_func_entry *entry,
+ unsigned long ip, unsigned long direct)
+{
+ entry->ip = ip;
+ entry->direct = direct;
+ add_ftrace_hash_entry(hash, entry);
+}
+
+static int register_fentry_multi(struct bpf_trampoline *tr, struct bpf_tramp_image *im, void *ptr)
+{
+ unsigned long addr = (unsigned long) im->image;
+ unsigned long ip = ftrace_location(tr->ip);
+ struct bpf_tracing_multi_data *data = ptr;
+
+ if (bpf_trampoline_use_jmp(tr->flags))
+ addr = ftrace_jmp_set(addr);
+
+ ftrace_hash_add(data->reg, data->entry, ip, addr);
+ tr->cur_image = im;
+ return 0;
+}
+
+static int unregister_fentry_multi(struct bpf_trampoline *tr, u32 orig_flags, void *ptr)
+{
+ unsigned long addr = (unsigned long) tr->cur_image->image;
+ unsigned long ip = ftrace_location(tr->ip);
+ struct bpf_tracing_multi_data *data = ptr;
+
+ if (bpf_trampoline_use_jmp(tr->flags))
+ addr = ftrace_jmp_set(addr);
+
+ ftrace_hash_add(data->unreg, data->entry, ip, addr);
+ tr->cur_image = NULL;
+ return 0;
+}
+
+static int modify_fentry_multi(struct bpf_trampoline *tr, u32 orig_flags, struct bpf_tramp_image *im,
+ bool lock_direct_mutex, void *ptr)
+{
+ unsigned long addr = (unsigned long) im->image;
+ unsigned long ip = ftrace_location(tr->ip);
+ struct bpf_tracing_multi_data *data = ptr;
+
+ if (bpf_trampoline_use_jmp(tr->flags))
+ addr = ftrace_jmp_set(addr);
+
+ ftrace_hash_add(data->modify, data->entry, ip, addr);
+ tr->cur_image = im;
+ return 0;
+}
+
+static const struct bpf_trampoline_ops trampoline_multi_ops = {
+ .register_fentry = register_fentry_multi,
+ .unregister_fentry = unregister_fentry_multi,
+ .modify_fentry = modify_fentry_multi,
+};
+
+static void bpf_trampoline_multi_attach_init(struct bpf_trampoline *tr)
+{
+ tr->multi_attach.old_image = tr->cur_image;
+ tr->multi_attach.old_flags = tr->flags;
+}
+
+static void bpf_trampoline_multi_attach_free(struct bpf_trampoline *tr)
+{
+ if (tr->multi_attach.old_image)
+ bpf_tramp_image_put(tr->multi_attach.old_image);
+
+ tr->multi_attach.old_image = NULL;
+ tr->multi_attach.old_flags = 0;
+}
+
+static void bpf_trampoline_multi_attach_rollback(struct bpf_trampoline *tr)
+{
+ if (tr->cur_image)
+ bpf_tramp_image_put(tr->cur_image);
+ tr->cur_image = tr->multi_attach.old_image;
+ tr->flags = tr->multi_attach.old_flags;
+
+ tr->multi_attach.old_image = NULL;
+ tr->multi_attach.old_flags = 0;
+}
+
+#define for_each_mnode_cnt(mnode, link, cnt) \
+ for (i = 0, mnode = &link->nodes[i]; i < cnt; i++, mnode = &link->nodes[i])
+
+#define for_each_mnode(mnode, link) \
+ for_each_mnode_cnt(mnode, link, link->nodes_cnt)
+
+int bpf_trampoline_multi_attach(struct bpf_prog *prog, u32 *ids,
+ struct bpf_tracing_multi_link *link)
+{
+ struct bpf_tracing_multi_data *data = &link->data;
+ struct bpf_attach_target_info tgt_info = {};
+ struct btf *btf = prog->aux->attach_btf;
+ struct bpf_tracing_multi_node *mnode;
+ struct bpf_trampoline *tr;
+ int i, err, rollback_cnt;
+ u64 key;
+
+ for_each_mnode(mnode, link) {
+ rollback_cnt = i;
+
+ err = bpf_check_attach_btf_id_multi(btf, prog, ids[i], &tgt_info);
+ if (err)
+ goto rollback_put;
+
+ key = bpf_trampoline_compute_key(NULL, btf, ids[i]);
+
+ tr = bpf_trampoline_get(key, &tgt_info);
+ if (!tr) {
+ err = -ENOMEM;
+ goto rollback_put;
+ }
+
+ mnode->trampoline = tr;
+ mnode->node.link = &link->link;
+ mnode->node.cookie = link->cookies ? link->cookies[i] : 0;
+
+ if (prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) {
+ link->fexits[i].link = &link->link;
+ link->fexits[i].cookie = link->cookies ? link->cookies[i] : 0;
+ }
+
+ cond_resched();
+ }
+
+ err = init_tracing_multi_data(data);
+ if (err) {
+ rollback_cnt = link->nodes_cnt;
+ goto rollback_put;
+ }
+
+ trampoline_lock_all();
+
+ for_each_mnode(mnode, link) {
+ bpf_trampoline_multi_attach_init(mnode->trampoline);
+
+ data->entry = &mnode->entry;
+ err = __bpf_trampoline_link_prog(&mnode->node, mnode->trampoline, NULL,
+ &trampoline_multi_ops, data);
+ if (err) {
+ rollback_cnt = i;
+ goto rollback_unlink;
+ }
+ }
+
+ rollback_cnt = link->nodes_cnt;
+ if (ftrace_hash_count(data->reg)) {
+ err = update_ftrace_direct_add(&direct_ops, data->reg);
+ if (err)
+ goto rollback_unlink;
+ }
+
+ if (ftrace_hash_count(data->modify)) {
+ err = update_ftrace_direct_mod(&direct_ops, data->modify, true);
+ if (err) {
+ if (ftrace_hash_count(data->reg))
+ WARN_ON_ONCE(update_ftrace_direct_del(&direct_ops, data->reg));
+ goto rollback_unlink;
+ }
+ }
+
+ for_each_mnode(mnode, link)
+ bpf_trampoline_multi_attach_free(mnode->trampoline);
+
+ trampoline_unlock_all();
+
+ remove_tracing_multi_data(data);
+ return 0;
+
+rollback_unlink:
+ for_each_mnode_cnt(mnode, link, rollback_cnt) {
+ bpf_trampoline_remove_prog(mnode->trampoline, &mnode->node);
+ bpf_trampoline_multi_attach_rollback(mnode->trampoline);
+ }
+
+ trampoline_unlock_all();
+
+ clear_tracing_multi_data(data);
+ rollback_cnt = link->nodes_cnt;
+
+rollback_put:
+ for_each_mnode_cnt(mnode, link, rollback_cnt)
+ bpf_trampoline_put(mnode->trampoline);
+
+ return err;
+}
+
+int bpf_trampoline_multi_detach(struct bpf_prog *prog, struct bpf_tracing_multi_link *link)
+{
+ struct bpf_tracing_multi_data *data = &link->data;
+ struct bpf_tracing_multi_node *mnode;
+ int i;
+
+ trampoline_lock_all();
+
+ for_each_mnode(mnode, link) {
+ data->entry = &mnode->entry;
+ bpf_trampoline_multi_attach_init(mnode->trampoline);
+ WARN_ON_ONCE(__bpf_trampoline_unlink_prog(&mnode->node, mnode->trampoline,
+ NULL, &trampoline_multi_ops, data));
+ }
+
+ if (ftrace_hash_count(data->unreg))
+ WARN_ON_ONCE(update_ftrace_direct_del(&direct_ops, data->unreg));
+ if (ftrace_hash_count(data->modify))
+ WARN_ON_ONCE(update_ftrace_direct_mod(&direct_ops, data->modify, true));
+
+ for_each_mnode(mnode, link)
+ bpf_trampoline_multi_attach_free(mnode->trampoline);
+
+ trampoline_unlock_all();
+
+ for_each_mnode(mnode, link)
+ bpf_trampoline_put(mnode->trampoline);
+
+ clear_tracing_multi_data(data);
+ return 0;
+}
+
+#undef for_each_mnode_cnt
+#undef for_each_mnode
+
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS &&
+ CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS &&
+ CONFIG_BPF_SYSCALL */
+
static int __init init_trampolines(void)
{
int i;
@@ -1380,6 +1755,8 @@ static int __init init_trampolines(void)
INIT_HLIST_HEAD(&trampoline_key_table[i]);
for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
INIT_HLIST_HEAD(&trampoline_ip_table[i]);
+ for (i = 0; i < TRAMPOLINE_LOCKS_TABLE_SIZE; i++)
+ __mutex_init(&trampoline_locks[i].mutex, "trampoline_lock", &trampoline_locks[i].key);
return 0;
}
late_initcall(init_trampolines);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 69d75515ed3f..2abc79dbf281 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -26,6 +26,7 @@
#include <linux/poison.h>
#include <linux/module.h>
#include <linux/cpumask.h>
+#include <linux/cnum.h>
#include <linux/bpf_mem_alloc.h>
#include <net/xdp.h>
#include <linux/trace_events.h>
@@ -199,14 +200,15 @@ struct bpf_verifier_stack_elem {
#define BPF_PRIV_STACK_MIN_SIZE 64
-static int acquire_reference(struct bpf_verifier_env *env, int insn_idx);
-static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id);
-static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
+static int acquire_reference(struct bpf_verifier_env *env, int insn_idx, int parent_id);
+static int release_reference_nomark(struct bpf_verifier_state *state, int id);
+static int release_reference(struct bpf_verifier_env *env, int id);
static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env);
+static bool is_tracing_prog_type(enum bpf_prog_type type);
static int ref_set_non_owning(struct bpf_verifier_env *env,
struct bpf_reg_state *reg);
-static bool is_trusted_reg(const struct bpf_reg_state *reg);
+static bool is_trusted_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg);
static inline bool in_sleepable_context(struct bpf_verifier_env *env);
static const char *non_sleepable_context_description(struct bpf_verifier_env *env);
static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg);
@@ -230,8 +232,28 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
(poisoned ? BPF_MAP_KEY_POISON : 0ULL);
}
+static void update_ref_obj(struct ref_obj_desc *ref_obj, struct bpf_reg_state *reg)
+{
+ ref_obj->id = reg->id;
+ ref_obj->parent_id = reg->parent_id;
+ ref_obj->cnt++;
+}
+
+static int validate_ref_obj(struct bpf_verifier_env *env, struct ref_obj_desc *ref_obj)
+{
+ if (ref_obj->cnt > 1) {
+ verifier_bug(env, "function expects only one referenced object but got %d\n",
+ ref_obj->cnt);
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
struct bpf_call_arg_meta {
struct bpf_map_desc map;
+ struct bpf_dynptr_desc dynptr;
+ struct ref_obj_desc ref_obj;
bool raw_mode;
bool pkt_access;
u8 release_regno;
@@ -239,8 +261,6 @@ struct bpf_call_arg_meta {
int access_size;
int mem_size;
u64 msize_max_value;
- int ref_obj_id;
- int dynptr_id;
int func_id;
struct btf *btf;
u32 btf_id;
@@ -261,6 +281,41 @@ struct bpf_kfunc_meta {
struct btf *btf_vmlinux;
+typedef struct argno {
+ int argno;
+} argno_t;
+
+static argno_t argno_from_reg(u32 regno)
+{
+ return (argno_t){ .argno = regno };
+}
+
+static argno_t argno_from_arg(u32 arg)
+{
+ return (argno_t){ .argno = -arg };
+}
+
+static int reg_from_argno(argno_t a)
+{
+ if (a.argno >= 0)
+ return a.argno;
+ if (a.argno >= -MAX_BPF_FUNC_REG_ARGS)
+ return -a.argno;
+ return -1;
+}
+
+static int arg_from_argno(argno_t a)
+{
+ if (a.argno < 0)
+ return -a.argno;
+ return -1;
+}
+
+static int arg_idx_from_argno(argno_t a)
+{
+ return arg_from_argno(a) - 1;
+}
+
static const char *btf_type_name(const struct btf *btf, u32 id)
{
return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
@@ -290,12 +345,12 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env,
bool unknown = true;
verbose(env, "%s the register %s has", ctx, reg_name);
- if (reg->smin_value > S64_MIN) {
- verbose(env, " smin=%lld", reg->smin_value);
+ if (reg_smin(reg) > S64_MIN) {
+ verbose(env, " smin=%lld", reg_smin(reg));
unknown = false;
}
- if (reg->smax_value < S64_MAX) {
- verbose(env, " smax=%lld", reg->smax_value);
+ if (reg_smax(reg) < S64_MAX) {
+ verbose(env, " smax=%lld", reg_smax(reg));
unknown = false;
}
if (unknown)
@@ -303,7 +358,7 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env,
verbose(env, " should have been in [%d, %d]\n", range.minval, range.maxval);
}
-static bool reg_not_null(const struct bpf_reg_state *reg)
+static bool reg_not_null(struct bpf_verifier_env *env, const struct bpf_reg_state *reg)
{
enum bpf_reg_type type;
@@ -317,7 +372,7 @@ static bool reg_not_null(const struct bpf_reg_state *reg)
type == PTR_TO_MAP_VALUE ||
type == PTR_TO_MAP_KEY ||
type == PTR_TO_SOCK_COMMON ||
- (type == PTR_TO_BTF_ID && is_trusted_reg(reg)) ||
+ (type == PTR_TO_BTF_ID && is_trusted_reg(env, reg)) ||
(type == PTR_TO_MEM && !(reg->type & PTR_UNTRUSTED)) ||
type == CONST_PTR_TO_MAP;
}
@@ -434,15 +489,9 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id)
func_id == BPF_FUNC_skc_to_tcp_request_sock;
}
-static bool is_dynptr_ref_function(enum bpf_func_id func_id)
-{
- return func_id == BPF_FUNC_dynptr_data;
-}
-
static bool is_sync_callback_calling_kfunc(u32 btf_id);
static bool is_async_callback_calling_kfunc(u32 btf_id);
static bool is_callback_calling_kfunc(u32 btf_id);
-static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
static bool is_bpf_wq_set_callback_kfunc(u32 btf_id);
static bool is_task_work_add_kfunc(u32 func_id);
@@ -498,22 +547,6 @@ bool bpf_is_may_goto_insn(struct bpf_insn *insn)
return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO;
}
-static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,
- const struct bpf_map *map)
-{
- int ref_obj_uses = 0;
-
- if (is_ptr_cast_function(func_id))
- ref_obj_uses++;
- if (is_acquire_function(func_id, map))
- ref_obj_uses++;
- if (is_dynptr_ref_function(func_id))
- ref_obj_uses++;
-
- return ref_obj_uses > 1;
-}
-
-
static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots)
{
int allocated_slots = state->allocated_stack / BPF_REG_SIZE;
@@ -610,43 +643,44 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
}
}
-static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
+static bool dynptr_type_referenced(enum bpf_dynptr_type type)
{
return type == BPF_DYNPTR_TYPE_RINGBUF || type == BPF_DYNPTR_TYPE_FILE;
}
static void __mark_dynptr_reg(struct bpf_reg_state *reg,
enum bpf_dynptr_type type,
- bool first_slot, int dynptr_id);
+ bool first_slot, int id, int parent_id);
static void mark_dynptr_stack_regs(struct bpf_verifier_env *env,
struct bpf_reg_state *sreg1,
struct bpf_reg_state *sreg2,
- enum bpf_dynptr_type type)
+ enum bpf_dynptr_type type, int parent_id)
{
int id = ++env->id_gen;
- __mark_dynptr_reg(sreg1, type, true, id);
- __mark_dynptr_reg(sreg2, type, false, id);
+ __mark_dynptr_reg(sreg1, type, true, id, parent_id);
+ __mark_dynptr_reg(sreg2, type, false, id, parent_id);
}
static void mark_dynptr_cb_reg(struct bpf_verifier_env *env,
struct bpf_reg_state *reg,
enum bpf_dynptr_type type)
{
- __mark_dynptr_reg(reg, type, true, ++env->id_gen);
+ __mark_dynptr_reg(reg, type, true, ++env->id_gen, 0);
}
static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
struct bpf_func_state *state, int spi);
static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
- enum bpf_arg_type arg_type, int insn_idx, int clone_ref_obj_id)
+ enum bpf_arg_type arg_type, int insn_idx,
+ struct ref_obj_desc *ref_obj, struct bpf_dynptr_desc *dynptr)
{
struct bpf_func_state *state = bpf_func(env, reg);
+ int spi, i, err, parent_id = 0;
enum bpf_dynptr_type type;
- int spi, i, err;
spi = dynptr_get_spi(env, reg);
if (spi < 0)
@@ -677,94 +711,69 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
if (type == BPF_DYNPTR_TYPE_INVALID)
return -EINVAL;
- mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr,
- &state->stack[spi - 1].spilled_ptr, type);
+ if (dynptr->type == BPF_DYNPTR_TYPE_INVALID) { /* dynptr constructors */
+ err = validate_ref_obj(env, ref_obj);
+ if (err)
+ return err;
- if (dynptr_type_refcounted(type)) {
- /* The id is used to track proper releasing */
- int id;
+ /* Track parent's id if the parent is a referenced object */
+ parent_id = ref_obj->id;
- if (clone_ref_obj_id)
- id = clone_ref_obj_id;
- else
- id = acquire_reference(env, insn_idx);
+ if (dynptr_type_referenced(type)) {
+ int id;
- if (id < 0)
- return id;
+ /*
+ * Create an intermediate reference that tracks the referenced
+ * object for the referenced dynptr. Freeing a referenced dynptr
+ * through helpers/kfuncs will invalidate all clones.
+ */
+ id = acquire_reference(env, insn_idx, parent_id);
+ if (id < 0)
+ return id;
- state->stack[spi].spilled_ptr.ref_obj_id = id;
- state->stack[spi - 1].spilled_ptr.ref_obj_id = id;
+ parent_id = id;
+ }
+ } else { /* bpf_dynptr_clone() */
+ parent_id = dynptr->parent_id;
}
+ mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr,
+ &state->stack[spi - 1].spilled_ptr, type, parent_id);
+
return 0;
}
-static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi)
+static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_stack_state *stack)
{
int i;
for (i = 0; i < BPF_REG_SIZE; i++) {
- state->stack[spi].slot_type[i] = STACK_INVALID;
- state->stack[spi - 1].slot_type[i] = STACK_INVALID;
+ stack[0].slot_type[i] = STACK_INVALID;
+ stack[1].slot_type[i] = STACK_INVALID;
}
- bpf_mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
- bpf_mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
+ bpf_mark_reg_not_init(env, &stack[0].spilled_ptr);
+ bpf_mark_reg_not_init(env, &stack[1].spilled_ptr);
}
static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
struct bpf_func_state *state = bpf_func(env, reg);
- int spi, ref_obj_id, i;
+ int spi;
- /*
- * This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
- * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr
- * is safe to do directly.
- */
- if (reg->type == CONST_PTR_TO_DYNPTR) {
- verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released");
- return -EFAULT;
- }
spi = dynptr_get_spi(env, reg);
if (spi < 0)
return spi;
- if (!dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
- invalidate_dynptr(env, state, spi);
- return 0;
- }
-
- ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id;
-
- /* If the dynptr has a ref_obj_id, then we need to invalidate
- * two things:
- *
- * 1) Any dynptrs with a matching ref_obj_id (clones)
- * 2) Any slices derived from this dynptr.
+ /*
+ * For referenced dynptr, release the parent ref which cascades to
+ * all clones and derived slices. For non-referenced dynptr, only
+ * the dynptr and slices derived from it will be invalidated.
*/
-
- /* Invalidate any slices associated with this dynptr */
- WARN_ON_ONCE(release_reference(env, ref_obj_id));
-
- /* Invalidate any dynptr clones */
- for (i = 1; i < state->allocated_stack / BPF_REG_SIZE; i++) {
- if (state->stack[i].spilled_ptr.ref_obj_id != ref_obj_id)
- continue;
-
- /* it should always be the case that if the ref obj id
- * matches then the stack slot also belongs to a
- * dynptr
- */
- if (state->stack[i].slot_type[0] != STACK_DYNPTR) {
- verifier_bug(env, "misconfigured ref_obj_id");
- return -EFAULT;
- }
- if (state->stack[i].spilled_ptr.dynptr.first_slot)
- invalidate_dynptr(env, state, i);
- }
-
- return 0;
+ reg = &state->stack[spi].spilled_ptr;
+ return release_reference(env, dynptr_type_referenced(reg->dynptr.type)
+ ? reg->parent_id
+ : reg->id);
}
static void __mark_reg_unknown(const struct bpf_verifier_env *env,
@@ -778,12 +787,29 @@ static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_
__mark_reg_unknown(env, reg);
}
+/*
+ * Count the dynptrs in env->cur_state whose parent_id equals v_parent_id.
+ *
+ * Only stack entries whose slot_type[0] is STACK_DYNPTR and whose
+ * spilled_ptr is flagged as the dynptr's first slot are considered, so a
+ * dynptr is counted through its first slot only.
+ *
+ * Returns the number of matching dynptrs (0 when there is none).
+ */
+static int dynptr_ref_cnt(struct bpf_verifier_env *env, int v_parent_id)
+{
+	struct bpf_stack_state *stack;
+	struct bpf_func_state *state;
+	struct bpf_reg_state *reg;
+	int nr_sharing = 0;
+
+	bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, stack, 1 << STACK_DYNPTR, ({
+		/* Skip anything that is not the first slot of a stack dynptr. */
+		if (!stack || stack->slot_type[0] != STACK_DYNPTR ||
+		    !stack->spilled_ptr.dynptr.first_slot)
+			continue;
+		if (stack->spilled_ptr.parent_id == v_parent_id)
+			nr_sharing++;
+	}));
+
+	return nr_sharing;
+}
+
static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
struct bpf_func_state *state, int spi)
{
- struct bpf_func_state *fstate;
- struct bpf_reg_state *dreg;
- int i, dynptr_id;
+ int err = 0;
/* We always ensure that STACK_DYNPTR is never set partially,
* hence just checking for slot_type[0] is enough. This is
@@ -797,56 +823,25 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
spi = spi + 1;
- if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
- int ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id;
- int ref_cnt = 0;
-
- /*
- * A referenced dynptr can be overwritten only if there is at
- * least one other dynptr sharing the same ref_obj_id,
- * ensuring the reference can still be properly released.
- */
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
- if (state->stack[i].slot_type[0] != STACK_DYNPTR)
- continue;
- if (!state->stack[i].spilled_ptr.dynptr.first_slot)
- continue;
- if (state->stack[i].spilled_ptr.ref_obj_id == ref_obj_id)
- ref_cnt++;
- }
-
- if (ref_cnt <= 1) {
- verbose(env, "cannot overwrite referenced dynptr\n");
- return -EINVAL;
- }
+ /*
+ * A referenced dynptr can be overwritten only if there is at
+ * least one other dynptr sharing the same virtual ref parent,
+ * ensuring the reference can still be properly released.
+ */
+ if (dynptr_type_referenced(state->stack[spi].spilled_ptr.dynptr.type) &&
+ dynptr_ref_cnt(env, state->stack[spi].spilled_ptr.parent_id) <= 1) {
+ verbose(env, "cannot overwrite referenced dynptr\n");
+ return -EINVAL;
}
- mark_stack_slot_scratched(env, spi);
- mark_stack_slot_scratched(env, spi - 1);
-
- /* Writing partially to one dynptr stack slot destroys both. */
- for (i = 0; i < BPF_REG_SIZE; i++) {
- state->stack[spi].slot_type[i] = STACK_INVALID;
- state->stack[spi - 1].slot_type[i] = STACK_INVALID;
+ /* Invalidate the dynptr and any derived slices */
+ err = release_reference(env, state->stack[spi].spilled_ptr.id);
+ if (!err) {
+ mark_stack_slot_scratched(env, spi);
+ mark_stack_slot_scratched(env, spi - 1);
}
- dynptr_id = state->stack[spi].spilled_ptr.id;
- /* Invalidate any slices associated with this dynptr */
- bpf_for_each_reg_in_vstate(env->cur_state, fstate, dreg, ({
- /* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */
- if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM)
- continue;
- if (dreg->dynptr_id == dynptr_id)
- mark_reg_invalid(env, dreg);
- }));
-
- /* Do not release reference state, we are destroying dynptr on stack,
- * not using some helper to release it. Just reset register.
- */
- bpf_mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
- bpf_mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
-
- return 0;
+ return err;
}
static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
@@ -946,7 +941,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
if (spi < 0)
return spi;
- id = acquire_reference(env, insn_idx);
+ id = acquire_reference(env, insn_idx, 0);
if (id < 0)
return id;
@@ -962,7 +957,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
else
st->type |= PTR_UNTRUSTED;
}
- st->ref_obj_id = i == 0 ? id : 0;
+ st->id = i == 0 ? id : 0;
st->iter.btf = btf;
st->iter.btf_id = btf_id;
st->iter.state = BPF_ITER_STATE_ACTIVE;
@@ -992,7 +987,7 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
struct bpf_reg_state *st = &slot->spilled_ptr;
if (i == 0)
- WARN_ON_ONCE(release_reference(env, st->ref_obj_id));
+ WARN_ON_ONCE(release_reference(env, st->id));
bpf_mark_reg_not_init(env, st);
@@ -1048,10 +1043,10 @@ static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_s
if (st->type & PTR_UNTRUSTED)
return -EPROTO;
- /* only main (first) slot has ref_obj_id set */
- if (i == 0 && !st->ref_obj_id)
+ /* only main (first) slot has id set */
+ if (i == 0 && !st->id)
return -EINVAL;
- if (i != 0 && st->ref_obj_id)
+ if (i != 0 && st->id)
return -EINVAL;
if (st->iter.btf != btf || st->iter.btf_id != btf_id)
return -EINVAL;
@@ -1090,7 +1085,7 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env,
__mark_reg_known_zero(st);
st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
- st->ref_obj_id = id;
+ st->id = id;
st->irq.kfunc_class = kfunc_class;
for (i = 0; i < BPF_REG_SIZE; i++)
@@ -1124,7 +1119,7 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r
return -EINVAL;
}
- err = release_irq_state(env->cur_state, st->ref_obj_id);
+ err = release_irq_state(env->cur_state, st->id);
WARN_ON_ONCE(err && err != -EACCES);
if (err) {
int insn_idx = 0;
@@ -1188,7 +1183,7 @@ static int is_irq_flag_reg_valid_init(struct bpf_verifier_env *env, struct bpf_r
slot = &state->stack[spi];
st = &slot->spilled_ptr;
- if (!st->ref_obj_id)
+ if (!st->id)
return -EINVAL;
for (i = 0; i < BPF_REG_SIZE; i++)
@@ -1340,6 +1335,18 @@ static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_st
return -ENOMEM;
dst->allocated_stack = src->allocated_stack;
+
+ /* copy stack args state */
+ n = src->out_stack_arg_cnt;
+ if (n) {
+ dst->stack_arg_regs = copy_array(dst->stack_arg_regs, src->stack_arg_regs, n,
+ sizeof(struct bpf_reg_state),
+ GFP_KERNEL_ACCOUNT);
+ if (!dst->stack_arg_regs)
+ return -ENOMEM;
+ }
+
+ dst->out_stack_arg_cnt = src->out_stack_arg_cnt;
return 0;
}
@@ -1381,6 +1388,23 @@ static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state
return 0;
}
+/*
+ * Make sure state->stack_arg_regs has room for at least @cnt entries.
+ *
+ * Nothing is done when state->out_stack_arg_cnt already covers @cnt.
+ * Otherwise the array is resized through realloc_array() and
+ * out_stack_arg_cnt is raised to @cnt.
+ *
+ * Returns 0 on success (including the no-op case) and -ENOMEM when
+ * realloc_array() returns NULL; in that case stack_arg_regs is left NULL
+ * and out_stack_arg_cnt is not updated. @env is currently unused.
+ */
+static int grow_stack_arg_slots(struct bpf_verifier_env *env,
+				struct bpf_func_state *state, int cnt)
+{
+	size_t cur_cnt = state->out_stack_arg_cnt;
+	struct bpf_reg_state *regs;
+
+	/* Already large enough: the array never shrinks here. */
+	if (cnt <= cur_cnt)
+		return 0;
+
+	regs = realloc_array(state->stack_arg_regs, cur_cnt, cnt, sizeof(*regs));
+	/* Store the result unconditionally, the old pointer must not be kept. */
+	state->stack_arg_regs = regs;
+	if (!regs)
+		return -ENOMEM;
+
+	state->out_stack_arg_cnt = cnt;
+	return 0;
+}
+
/* Acquire a pointer id from the env and update the state->refs to include
* this new pointer reference.
* On success, returns a valid pointer id to associate with the register
@@ -1400,7 +1424,7 @@ static struct bpf_reference_state *acquire_reference_state(struct bpf_verifier_e
return &state->refs[new_ofs];
}
-static int acquire_reference(struct bpf_verifier_env *env, int insn_idx)
+static int acquire_reference(struct bpf_verifier_env *env, int insn_idx, int parent_id)
{
struct bpf_reference_state *s;
@@ -1409,6 +1433,7 @@ static int acquire_reference(struct bpf_verifier_env *env, int insn_idx)
return -ENOMEM;
s->type = REF_TYPE_PTR;
s->id = ++env->id_gen;
+ s->parent_id = parent_id;
return s->id;
}
@@ -1465,17 +1490,25 @@ static void release_reference_state(struct bpf_verifier_state *state, int idx)
return;
}
-static bool find_reference_state(struct bpf_verifier_state *state, int ptr_id)
+static bool find_reference_state(struct bpf_verifier_state *state, int id)
{
int i;
- for (i = 0; i < state->acquired_refs; i++)
- if (state->refs[i].id == ptr_id)
+ for (i = 0; i < state->acquired_refs; i++) {
+ if (state->refs[i].type != REF_TYPE_PTR)
+ continue;
+ if (state->refs[i].id == id)
return true;
+ }
return false;
}
+/*
+ * Tell whether @reg's id is tracked as a reference in the current state.
+ *
+ * Returns the result of find_reference_state() for env->cur_state and
+ * reg->id.
+ */
+static bool reg_is_referenced(struct bpf_verifier_env *env, const struct bpf_reg_state *reg)
+{
+	struct bpf_verifier_state *cur = env->cur_state;
+
+	return find_reference_state(cur, reg->id);
+}
+
static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr)
{
void *prev_ptr = NULL;
@@ -1543,6 +1576,7 @@ static void free_func_state(struct bpf_func_state *state)
{
if (!state)
return;
+ kfree(state->stack_arg_regs);
kfree(state->stack);
kfree(state);
}
@@ -1751,6 +1785,22 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
return &elem->st;
}
+/*
+ * Format a printable name for the argument identified by @argno.
+ *
+ * When reg_from_argno() yields a non-negative register number the name is
+ * "R<regno>". Otherwise the argument index from arg_from_argno() is turned
+ * into a byte offset, (arg - MAX_BPF_FUNC_REG_ARGS) * BPF_REG_SIZE, and the
+ * name is "*(R11-<offset>)".
+ *
+ * Returns env->tmp_arg_name. The buffer is rewritten on every call, so the
+ * string must be consumed before this function is called again for @env.
+ */
+static const char *reg_arg_name(struct bpf_verifier_env *env, argno_t argno)
+{
+	char *buf = env->tmp_arg_name;
+	int len = sizeof(env->tmp_arg_name);
+	int arg, off, regno = reg_from_argno(argno);
+
+	if (regno >= 0) {
+		snprintf(buf, len, "R%d", regno);
+	} else {
+		arg = arg_from_argno(argno);
+		/* Keep the offset in an int so it matches the %d conversion. */
+		off = (arg - MAX_BPF_FUNC_REG_ARGS) * BPF_REG_SIZE;
+		snprintf(buf, len, "*(R11-%d)", off);
+	}
+
+	return buf;
+}
+
static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
@@ -1759,15 +1809,8 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
reg->var_off = tnum_const(imm);
- reg->smin_value = (s64)imm;
- reg->smax_value = (s64)imm;
- reg->umin_value = imm;
- reg->umax_value = imm;
-
- reg->s32_min_value = (s32)imm;
- reg->s32_max_value = (s32)imm;
- reg->u32_min_value = (u32)imm;
- reg->u32_max_value = (u32)imm;
+ reg->r64 = cnum64_from_urange(imm, imm);
+ reg->r32 = cnum32_from_urange((u32)imm, (u32)imm);
}
/* Mark the unknown part of a register (variable offset or scalar value) as
@@ -1779,17 +1822,14 @@ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
memset(((u8 *)reg) + sizeof(reg->type), 0,
offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));
reg->id = 0;
- reg->ref_obj_id = 0;
+ reg->parent_id = 0;
___mark_reg_known(reg, imm);
}
static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm)
{
reg->var_off = tnum_const_subreg(reg->var_off, imm);
- reg->s32_min_value = (s32)imm;
- reg->s32_max_value = (s32)imm;
- reg->u32_min_value = (u32)imm;
- reg->u32_max_value = (u32)imm;
+ reg->r32 = cnum32_from_urange((u32)imm, (u32)imm);
}
/* Mark the 'variable offset' part of a register as zero. This should be
@@ -1817,7 +1857,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
}
static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type,
- bool first_slot, int dynptr_id)
+ bool first_slot, int id, int parent_id)
{
/* reg->type has no meaning for STACK_DYNPTR, but when we set reg for
* callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply
@@ -1826,7 +1866,8 @@ static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type ty
__mark_reg_known_zero(reg);
reg->type = CONST_PTR_TO_DYNPTR;
/* Give each dynptr a unique id to uniquely associate slices to it. */
- reg->id = dynptr_id;
+ reg->id = id;
+ reg->parent_id = parent_id;
reg->dynptr.type = type;
reg->dynptr.first_slot = first_slot;
}
@@ -1900,34 +1941,21 @@ static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
tnum_equals_const(reg->var_off, 0);
}
-/* Reset the min/max bounds of a register */
-static void __mark_reg_unbounded(struct bpf_reg_state *reg)
+static void __mark_reg32_unbounded(struct bpf_reg_state *reg)
{
- reg->smin_value = S64_MIN;
- reg->smax_value = S64_MAX;
- reg->umin_value = 0;
- reg->umax_value = U64_MAX;
-
- reg->s32_min_value = S32_MIN;
- reg->s32_max_value = S32_MAX;
- reg->u32_min_value = 0;
- reg->u32_max_value = U32_MAX;
+ reg->r32 = CNUM32_UNBOUNDED;
}
static void __mark_reg64_unbounded(struct bpf_reg_state *reg)
{
- reg->smin_value = S64_MIN;
- reg->smax_value = S64_MAX;
- reg->umin_value = 0;
- reg->umax_value = U64_MAX;
+ reg->r64 = CNUM64_UNBOUNDED;
}
-static void __mark_reg32_unbounded(struct bpf_reg_state *reg)
+/* Reset the min/max bounds of a register */
+static void __mark_reg_unbounded(struct bpf_reg_state *reg)
{
- reg->s32_min_value = S32_MIN;
- reg->s32_max_value = S32_MAX;
- reg->u32_min_value = 0;
- reg->u32_max_value = U32_MAX;
+ __mark_reg64_unbounded(reg);
+ __mark_reg32_unbounded(reg);
}
static void reset_reg64_and_tnum(struct bpf_reg_state *reg)
@@ -1942,19 +1970,32 @@ static void reset_reg32_and_tnum(struct bpf_reg_state *reg)
reg->var_off = tnum_unknown;
}
-static void __update_reg32_bounds(struct bpf_reg_state *reg)
+static struct cnum32 cnum32_from_tnum(struct tnum tnum)
{
- struct tnum var32_off = tnum_subreg(reg->var_off);
+ tnum = tnum_subreg(tnum);
+ if ((tnum.mask & S32_MIN) || (tnum.value & S32_MIN))
+ /* min signed is max(sign bit) | min(other bits) */
+ /* max signed is min(sign bit) | max(other bits) */
+ return cnum32_from_srange(tnum.value | (tnum.mask & S32_MIN),
+ tnum.value | (tnum.mask & S32_MAX));
+ else
+ return cnum32_from_urange(tnum.value, (tnum.value | tnum.mask));
+}
- /* min signed is max(sign bit) | min(other bits) */
- reg->s32_min_value = max_t(s32, reg->s32_min_value,
- var32_off.value | (var32_off.mask & S32_MIN));
- /* max signed is min(sign bit) | max(other bits) */
- reg->s32_max_value = min_t(s32, reg->s32_max_value,
- var32_off.value | (var32_off.mask & S32_MAX));
- reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value);
- reg->u32_max_value = min(reg->u32_max_value,
- (u32)(var32_off.value | var32_off.mask));
+/*
+ * Build the 64-bit cnum range described by @tnum.
+ *
+ * If the sign bit is set in either tnum.value or tnum.mask, the range is
+ * built with cnum64_from_srange(); otherwise it is built with
+ * cnum64_from_urange() from tnum.value up to tnum.value | tnum.mask.
+ */
+static struct cnum64 cnum64_from_tnum(struct tnum tnum)
+{
+	u64 lo, hi;
+
+	/* Sign bit neither known-set nor unknown: plain unsigned range. */
+	if (!((tnum.mask | tnum.value) & S64_MIN))
+		return cnum64_from_urange(tnum.value, (tnum.value | tnum.mask));
+
+	/* min signed is max(sign bit) | min(other bits) */
+	lo = tnum.value | (tnum.mask & S64_MIN);
+	/* max signed is min(sign bit) | max(other bits) */
+	hi = tnum.value | (tnum.mask & S64_MAX);
+
+	return cnum64_from_srange(lo, hi);
+}
+
+static void __update_reg32_bounds(struct bpf_reg_state *reg)
+{
+ cnum32_intersect_with(&reg->r32, cnum32_from_tnum(reg->var_off));
}
static void __update_reg64_bounds(struct bpf_reg_state *reg)
@@ -1962,26 +2003,18 @@ static void __update_reg64_bounds(struct bpf_reg_state *reg)
u64 tnum_next, tmax;
bool umin_in_tnum;
- /* min signed is max(sign bit) | min(other bits) */
- reg->smin_value = max_t(s64, reg->smin_value,
- reg->var_off.value | (reg->var_off.mask & S64_MIN));
- /* max signed is min(sign bit) | max(other bits) */
- reg->smax_value = min_t(s64, reg->smax_value,
- reg->var_off.value | (reg->var_off.mask & S64_MAX));
- reg->umin_value = max(reg->umin_value, reg->var_off.value);
- reg->umax_value = min(reg->umax_value,
- reg->var_off.value | reg->var_off.mask);
+ cnum64_intersect_with(&reg->r64, cnum64_from_tnum(reg->var_off));
/* Check if u64 and tnum overlap in a single value */
- tnum_next = tnum_step(reg->var_off, reg->umin_value);
- umin_in_tnum = (reg->umin_value & ~reg->var_off.mask) == reg->var_off.value;
+ tnum_next = tnum_step(reg->var_off, reg_umin(reg));
+ umin_in_tnum = (reg_umin(reg) & ~reg->var_off.mask) == reg->var_off.value;
tmax = reg->var_off.value | reg->var_off.mask;
- if (umin_in_tnum && tnum_next > reg->umax_value) {
+ if (umin_in_tnum && tnum_next > reg_umax(reg)) {
/* The u64 range and the tnum only overlap in umin.
* u64: ---[xxxxxx]-----
* tnum: --xx----------x-
*/
- ___mark_reg_known(reg, reg->umin_value);
+ ___mark_reg_known(reg, reg_umin(reg));
} else if (!umin_in_tnum && tnum_next == tmax) {
/* The u64 range and the tnum only overlap in the maximum value
* represented by the tnum, called tmax.
@@ -1989,8 +2022,8 @@ static void __update_reg64_bounds(struct bpf_reg_state *reg)
* tnum: xx-----x--------
*/
___mark_reg_known(reg, tmax);
- } else if (!umin_in_tnum && tnum_next <= reg->umax_value &&
- tnum_step(reg->var_off, tnum_next) > reg->umax_value) {
+ } else if (!umin_in_tnum && tnum_next <= reg_umax(reg) &&
+ tnum_step(reg->var_off, tnum_next) > reg_umax(reg)) {
/* The u64 range and the tnum only overlap in between umin
* (excluded) and umax.
* u64: ---[xxxxxx]-----
@@ -2006,329 +2039,19 @@ static void __update_reg_bounds(struct bpf_reg_state *reg)
__update_reg64_bounds(reg);
}
-/* Uses signed min/max values to inform unsigned, and vice-versa */
static void deduce_bounds_32_from_64(struct bpf_reg_state *reg)
{
- /* If upper 32 bits of u64/s64 range don't change, we can use lower 32
- * bits to improve our u32/s32 boundaries.
- *
- * E.g., the case where we have upper 32 bits as zero ([10, 20] in
- * u64) is pretty trivial, it's obvious that in u32 we'll also have
- * [10, 20] range. But this property holds for any 64-bit range as
- * long as upper 32 bits in that entire range of values stay the same.
- *
- * E.g., u64 range [0x10000000A, 0x10000000F] ([4294967306, 4294967311]
- * in decimal) has the same upper 32 bits throughout all the values in
- * that range. As such, lower 32 bits form a valid [0xA, 0xF] ([10, 15])
- * range.
- *
- * Note also, that [0xA, 0xF] is a valid range both in u32 and in s32,
- * following the rules outlined below about u64/s64 correspondence
- * (which equally applies to u32 vs s32 correspondence). In general it
- * depends on actual hexadecimal values of 32-bit range. They can form
- * only valid u32, or only valid s32 ranges in some cases.
- *
- * So we use all these insights to derive bounds for subregisters here.
- */
- if ((reg->umin_value >> 32) == (reg->umax_value >> 32)) {
- /* u64 to u32 casting preserves validity of low 32 bits as
- * a range, if upper 32 bits are the same
- */
- reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->umin_value);
- reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->umax_value);
-
- if ((s32)reg->umin_value <= (s32)reg->umax_value) {
- reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value);
- reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value);
- }
- }
- if ((reg->smin_value >> 32) == (reg->smax_value >> 32)) {
- /* low 32 bits should form a proper u32 range */
- if ((u32)reg->smin_value <= (u32)reg->smax_value) {
- reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->smin_value);
- reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->smax_value);
- }
- /* low 32 bits should form a proper s32 range */
- if ((s32)reg->smin_value <= (s32)reg->smax_value) {
- reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
- reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
- }
- }
- /* Special case where upper bits form a small sequence of two
- * sequential numbers (in 32-bit unsigned space, so 0xffffffff to
- * 0x00000000 is also valid), while lower bits form a proper s32 range
- * going from negative numbers to positive numbers. E.g., let's say we
- * have s64 range [-1, 1] ([0xffffffffffffffff, 0x0000000000000001]).
- * Possible s64 values are {-1, 0, 1} ({0xffffffffffffffff,
- * 0x0000000000000000, 0x00000000000001}). Ignoring upper 32 bits,
- * we still get a valid s32 range [-1, 1] ([0xffffffff, 0x00000001]).
- * Note that it doesn't have to be 0xffffffff going to 0x00000000 in
- * upper 32 bits. As a random example, s64 range
- * [0xfffffff0fffffff0; 0xfffffff100000010], forms a valid s32 range
- * [-16, 16] ([0xfffffff0; 0x00000010]) in its 32 bit subregister.
- */
- if ((u32)(reg->umin_value >> 32) + 1 == (u32)(reg->umax_value >> 32) &&
- (s32)reg->umin_value < 0 && (s32)reg->umax_value >= 0) {
- reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value);
- reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value);
- }
- if ((u32)(reg->smin_value >> 32) + 1 == (u32)(reg->smax_value >> 32) &&
- (s32)reg->smin_value < 0 && (s32)reg->smax_value >= 0) {
- reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
- reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
- }
-}
-
-static void deduce_bounds_32_from_32(struct bpf_reg_state *reg)
-{
- /* if u32 range forms a valid s32 range (due to matching sign bit),
- * try to learn from that
- */
- if ((s32)reg->u32_min_value <= (s32)reg->u32_max_value) {
- reg->s32_min_value = max_t(s32, reg->s32_min_value, reg->u32_min_value);
- reg->s32_max_value = min_t(s32, reg->s32_max_value, reg->u32_max_value);
- }
- /* If we cannot cross the sign boundary, then signed and unsigned bounds
- * are the same, so combine. This works even in the negative case, e.g.
- * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
- */
- if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) {
- reg->u32_min_value = max_t(u32, reg->s32_min_value, reg->u32_min_value);
- reg->u32_max_value = min_t(u32, reg->s32_max_value, reg->u32_max_value);
- } else {
- if (reg->u32_max_value < (u32)reg->s32_min_value) {
- /* See __reg64_deduce_bounds() for detailed explanation.
- * Refine ranges in the following situation:
- *
- * 0 U32_MAX
- * | [xxxxxxxxxxxxxx u32 range xxxxxxxxxxxxxx] |
- * |----------------------------|----------------------------|
- * |xxxxx s32 range xxxxxxxxx] [xxxxxxx|
- * 0 S32_MAX S32_MIN -1
- */
- reg->s32_min_value = (s32)reg->u32_min_value;
- reg->u32_max_value = min_t(u32, reg->u32_max_value, reg->s32_max_value);
- } else if ((u32)reg->s32_max_value < reg->u32_min_value) {
- /*
- * 0 U32_MAX
- * | [xxxxxxxxxxxxxx u32 range xxxxxxxxxxxxxx] |
- * |----------------------------|----------------------------|
- * |xxxxxxxxx] [xxxxxxxxxxxx s32 range |
- * 0 S32_MAX S32_MIN -1
- */
- reg->s32_max_value = (s32)reg->u32_max_value;
- reg->u32_min_value = max_t(u32, reg->u32_min_value, reg->s32_min_value);
- }
- }
-}
-
-static void deduce_bounds_64_from_64(struct bpf_reg_state *reg)
-{
- /* If u64 range forms a valid s64 range (due to matching sign bit),
- * try to learn from that. Let's do a bit of ASCII art to see when
- * this is happening. Let's take u64 range first:
- *
- * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX
- * |-------------------------------|--------------------------------|
- *
- * Valid u64 range is formed when umin and umax are anywhere in the
- * range [0, U64_MAX], and umin <= umax. u64 case is simple and
- * straightforward. Let's see how s64 range maps onto the same range
- * of values, annotated below the line for comparison:
- *
- * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX
- * |-------------------------------|--------------------------------|
- * 0 S64_MAX S64_MIN -1
- *
- * So s64 values basically start in the middle and they are logically
- * contiguous to the right of it, wrapping around from -1 to 0, and
- * then finishing as S64_MAX (0x7fffffffffffffff) right before
- * S64_MIN. We can try drawing the continuity of u64 vs s64 values
- * more visually as mapped to sign-agnostic range of hex values.
- *
- * u64 start u64 end
- * _______________________________________________________________
- * / \
- * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX
- * |-------------------------------|--------------------------------|
- * 0 S64_MAX S64_MIN -1
- * / \
- * >------------------------------ ------------------------------->
- * s64 continues... s64 end s64 start s64 "midpoint"
- *
- * What this means is that, in general, we can't always derive
- * something new about u64 from any random s64 range, and vice versa.
- *
- * But we can do that in two particular cases. One is when entire
- * u64/s64 range is *entirely* contained within left half of the above
- * diagram or when it is *entirely* contained in the right half. I.e.:
- *
- * |-------------------------------|--------------------------------|
- * ^ ^ ^ ^
- * A B C D
- *
- * [A, B] and [C, D] are contained entirely in their respective halves
- * and form valid contiguous ranges as both u64 and s64 values. [A, B]
- * will be non-negative both as u64 and s64 (and in fact it will be
- * identical ranges no matter the signedness). [C, D] treated as s64
- * will be a range of negative values, while in u64 it will be
- * non-negative range of values larger than 0x8000000000000000.
- *
- * Now, any other range here can't be represented in both u64 and s64
- * simultaneously. E.g., [A, C], [A, D], [B, C], [B, D] are valid
- * contiguous u64 ranges, but they are discontinuous in s64. [B, C]
- * in s64 would be properly presented as [S64_MIN, C] and [B, S64_MAX],
- * for example. Similarly, valid s64 range [D, A] (going from negative
- * to positive values), would be two separate [D, U64_MAX] and [0, A]
- * ranges as u64. Currently reg_state can't represent two segments per
- * numeric domain, so in such situations we can only derive maximal
- * possible range ([0, U64_MAX] for u64, and [S64_MIN, S64_MAX] for s64).
- *
- * So we use these facts to derive umin/umax from smin/smax and vice
- * versa only if they stay within the same "half". This is equivalent
- * to checking sign bit: lower half will have sign bit as zero, upper
- * half have sign bit 1. Below in code we simplify this by just
- * casting umin/umax as smin/smax and checking if they form valid
- * range, and vice versa. Those are equivalent checks.
- */
- if ((s64)reg->umin_value <= (s64)reg->umax_value) {
- reg->smin_value = max_t(s64, reg->smin_value, reg->umin_value);
- reg->smax_value = min_t(s64, reg->smax_value, reg->umax_value);
- }
- /* If we cannot cross the sign boundary, then signed and unsigned bounds
- * are the same, so combine. This works even in the negative case, e.g.
- * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
- */
- if ((u64)reg->smin_value <= (u64)reg->smax_value) {
- reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value);
- reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value);
- } else {
- /* If the s64 range crosses the sign boundary, then it's split
- * between the beginning and end of the U64 domain. In that
- * case, we can derive new bounds if the u64 range overlaps
- * with only one end of the s64 range.
- *
- * In the following example, the u64 range overlaps only with
- * positive portion of the s64 range.
- *
- * 0 U64_MAX
- * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] |
- * |----------------------------|----------------------------|
- * |xxxxx s64 range xxxxxxxxx] [xxxxxxx|
- * 0 S64_MAX S64_MIN -1
- *
- * We can thus derive the following new s64 and u64 ranges.
- *
- * 0 U64_MAX
- * | [xxxxxx u64 range xxxxx] |
- * |----------------------------|----------------------------|
- * | [xxxxxx s64 range xxxxx] |
- * 0 S64_MAX S64_MIN -1
- *
- * If they overlap in two places, we can't derive anything
- * because reg_state can't represent two ranges per numeric
- * domain.
- *
- * 0 U64_MAX
- * | [xxxxxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxxxxx] |
- * |----------------------------|----------------------------|
- * |xxxxx s64 range xxxxxxxxx] [xxxxxxxxxx|
- * 0 S64_MAX S64_MIN -1
- *
- * The first condition below corresponds to the first diagram
- * above.
- */
- if (reg->umax_value < (u64)reg->smin_value) {
- reg->smin_value = (s64)reg->umin_value;
- reg->umax_value = min_t(u64, reg->umax_value, reg->smax_value);
- } else if ((u64)reg->smax_value < reg->umin_value) {
- /* This second condition considers the case where the u64 range
- * overlaps with the negative portion of the s64 range:
- *
- * 0 U64_MAX
- * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] |
- * |----------------------------|----------------------------|
- * |xxxxxxxxx] [xxxxxxxxxxxx s64 range |
- * 0 S64_MAX S64_MIN -1
- */
- reg->smax_value = (s64)reg->umax_value;
- reg->umin_value = max_t(u64, reg->umin_value, reg->smin_value);
- }
- }
+ cnum32_intersect_with(&reg->r32, cnum32_from_cnum64(reg->r64));
}
static void deduce_bounds_64_from_32(struct bpf_reg_state *reg)
{
- /* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit
- * values on both sides of 64-bit range in hope to have tighter range.
- * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from
- * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff].
- * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound
- * (0x100000000 -> 0x100000001) and 0x7fffffff as low 32-bits of
- * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a
- * better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff].
- * We just need to make sure that derived bounds we are intersecting
- * with are well-formed ranges in respective s64 or u64 domain, just
- * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments.
- */
- __u64 new_umin, new_umax;
- __s64 new_smin, new_smax;
-
- /* u32 -> u64 tightening, it's always well-formed */
- new_umin = (reg->umin_value & ~0xffffffffULL) | reg->u32_min_value;
- new_umax = (reg->umax_value & ~0xffffffffULL) | reg->u32_max_value;
- reg->umin_value = max_t(u64, reg->umin_value, new_umin);
- reg->umax_value = min_t(u64, reg->umax_value, new_umax);
- /* u32 -> s64 tightening, u32 range embedded into s64 preserves range validity */
- new_smin = (reg->smin_value & ~0xffffffffULL) | reg->u32_min_value;
- new_smax = (reg->smax_value & ~0xffffffffULL) | reg->u32_max_value;
- reg->smin_value = max_t(s64, reg->smin_value, new_smin);
- reg->smax_value = min_t(s64, reg->smax_value, new_smax);
-
- /* Here we would like to handle a special case after sign extending load,
- * when upper bits for a 64-bit range are all 1s or all 0s.
- *
- * Upper bits are all 1s when register is in a range:
- * [0xffff_ffff_0000_0000, 0xffff_ffff_ffff_ffff]
- * Upper bits are all 0s when register is in a range:
- * [0x0000_0000_0000_0000, 0x0000_0000_ffff_ffff]
- * Together this forms are continuous range:
- * [0xffff_ffff_0000_0000, 0x0000_0000_ffff_ffff]
- *
- * Now, suppose that register range is in fact tighter:
- * [0xffff_ffff_8000_0000, 0x0000_0000_ffff_ffff] (R)
- * Also suppose that it's 32-bit range is positive,
- * meaning that lower 32-bits of the full 64-bit register
- * are in the range:
- * [0x0000_0000, 0x7fff_ffff] (W)
- *
- * If this happens, then any value in a range:
- * [0xffff_ffff_0000_0000, 0xffff_ffff_7fff_ffff]
- * is smaller than a lowest bound of the range (R):
- * 0xffff_ffff_8000_0000
- * which means that upper bits of the full 64-bit register
- * can't be all 1s, when lower bits are in range (W).
- *
- * Note that:
- * - 0xffff_ffff_8000_0000 == (s64)S32_MIN
- * - 0x0000_0000_7fff_ffff == (s64)S32_MAX
- * These relations are used in the conditions below.
- */
- if (reg->s32_min_value >= 0 && reg->smin_value >= S32_MIN && reg->smax_value <= S32_MAX) {
- reg->smin_value = reg->s32_min_value;
- reg->smax_value = reg->s32_max_value;
- reg->umin_value = reg->s32_min_value;
- reg->umax_value = reg->s32_max_value;
- reg->var_off = tnum_intersect(reg->var_off,
- tnum_range(reg->smin_value, reg->smax_value));
- }
+ reg->r64 = cnum64_cnum32_intersect(reg->r64, reg->r32);
}
static void __reg_deduce_bounds(struct bpf_reg_state *reg)
{
- deduce_bounds_64_from_64(reg);
deduce_bounds_32_from_64(reg);
- deduce_bounds_32_from_32(reg);
deduce_bounds_64_from_32(reg);
}
@@ -2336,11 +2059,11 @@ static void __reg_deduce_bounds(struct bpf_reg_state *reg)
static void __reg_bound_offset(struct bpf_reg_state *reg)
{
struct tnum var64_off = tnum_intersect(reg->var_off,
- tnum_range(reg->umin_value,
- reg->umax_value));
+ tnum_range(reg_umin(reg),
+ reg_umax(reg)));
struct tnum var32_off = tnum_intersect(tnum_subreg(var64_off),
- tnum_range(reg->u32_min_value,
- reg->u32_max_value));
+ tnum_range(reg_u32_min(reg),
+ reg_u32_max(reg)));
reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
}
@@ -2366,35 +2089,25 @@ static void reg_bounds_sync(struct bpf_reg_state *reg)
__update_reg_bounds(reg);
}
-static bool range_bounds_violation(struct bpf_reg_state *reg)
-{
- return (reg->umin_value > reg->umax_value || reg->smin_value > reg->smax_value ||
- reg->u32_min_value > reg->u32_max_value ||
- reg->s32_min_value > reg->s32_max_value);
-}
-
static bool const_tnum_range_mismatch(struct bpf_reg_state *reg)
{
- u64 uval = reg->var_off.value;
- s64 sval = (s64)uval;
-
if (!tnum_is_const(reg->var_off))
return false;
- return reg->umin_value != uval || reg->umax_value != uval ||
- reg->smin_value != sval || reg->smax_value != sval;
+ return !cnum64_is_const(reg->r64) || reg->r64.base != reg->var_off.value;
}
static bool const_tnum_range_mismatch_32(struct bpf_reg_state *reg)
{
- u32 uval32 = tnum_subreg(reg->var_off).value;
- s32 sval32 = (s32)uval32;
-
if (!tnum_subreg_is_const(reg->var_off))
return false;
- return reg->u32_min_value != uval32 || reg->u32_max_value != uval32 ||
- reg->s32_min_value != sval32 || reg->s32_max_value != sval32;
+ return !cnum32_is_const(reg->r32) || reg->r32.base != tnum_subreg(reg->var_off).value;
+}
+
+static bool range_bounds_violation(struct bpf_reg_state *reg)
+{
+ return cnum32_is_empty(reg->r32) || cnum64_is_empty(reg->r64);
}
static int reg_bounds_sanity_check(struct bpf_verifier_env *env,
@@ -2419,12 +2132,11 @@ static int reg_bounds_sanity_check(struct bpf_verifier_env *env,
return 0;
out:
- verifier_bug(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] "
- "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)",
- ctx, msg, reg->umin_value, reg->umax_value,
- reg->smin_value, reg->smax_value,
- reg->u32_min_value, reg->u32_max_value,
- reg->s32_min_value, reg->s32_max_value,
+ verifier_bug(env, "REG INVARIANTS VIOLATION (%s): %s r64={.base=%#llx, .size=%#llx} "
+ "r32={.base=%#x, .size=%#x} var_off=(%#llx, %#llx)",
+ ctx, msg,
+ reg->r64.base, reg->r64.size,
+ reg->r32.base, reg->r32.size,
reg->var_off.value, reg->var_off.mask);
if (env->test_reg_invariants)
return -EFAULT;
@@ -2432,44 +2144,15 @@ out:
return 0;
}
-static bool __reg32_bound_s64(s32 a)
-{
- return a >= 0 && a <= S32_MAX;
-}
-
-static void __reg_assign_32_into_64(struct bpf_reg_state *reg)
-{
- reg->umin_value = reg->u32_min_value;
- reg->umax_value = reg->u32_max_value;
-
- /* Attempt to pull 32-bit signed bounds into 64-bit bounds but must
- * be positive otherwise set to worse case bounds and refine later
- * from tnum.
- */
- if (__reg32_bound_s64(reg->s32_min_value) &&
- __reg32_bound_s64(reg->s32_max_value)) {
- reg->smin_value = reg->s32_min_value;
- reg->smax_value = reg->s32_max_value;
- } else {
- reg->smin_value = 0;
- reg->smax_value = U32_MAX;
- }
-}
-
/* Mark a register as having a completely unknown (scalar) value. */
void bpf_mark_reg_unknown_imprecise(struct bpf_reg_state *reg)
{
- /*
- * Clear type, off, and union(map_ptr, range) and
- * padding between 'type' and union
- */
- memset(reg, 0, offsetof(struct bpf_reg_state, var_off));
+ s32 subreg_def = reg->subreg_def;
+
+ memset(reg, 0, sizeof(*reg));
reg->type = SCALAR_VALUE;
- reg->id = 0;
- reg->ref_obj_id = 0;
reg->var_off = tnum_unknown;
- reg->frameno = 0;
- reg->precise = false;
+ reg->subreg_def = subreg_def;
__mark_reg_unbounded(reg);
}
@@ -2497,11 +2180,12 @@ static int __mark_reg_s32_range(struct bpf_verifier_env *env,
{
struct bpf_reg_state *reg = regs + regno;
- reg->s32_min_value = max_t(s32, reg->s32_min_value, s32_min);
- reg->s32_max_value = min_t(s32, reg->s32_max_value, s32_max);
-
- reg->smin_value = max_t(s64, reg->smin_value, s32_min);
- reg->smax_value = min_t(s64, reg->smax_value, s32_max);
+ reg_set_srange32(reg,
+ max_t(s32, reg_s32_min(reg), s32_min),
+ min_t(s32, reg_s32_max(reg), s32_max));
+ reg_set_srange64(reg,
+ max_t(s64, reg_smin(reg), s32_min),
+ min_t(s64, reg_smax(reg), s32_max));
reg_bounds_sync(reg);
@@ -3296,50 +2980,13 @@ out:
return ret;
}
-static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
- int spi, int nr_slots)
+static void mark_stack_slots_scratched(struct bpf_verifier_env *env,
+ int spi, int nr_slots)
{
int i;
for (i = 0; i < nr_slots; i++)
mark_stack_slot_scratched(env, spi - i);
- return 0;
-}
-
-static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
-{
- int spi;
-
- /* For CONST_PTR_TO_DYNPTR, it must have already been done by
- * check_reg_arg in check_helper_call and mark_btf_func_reg_size in
- * check_kfunc_call.
- */
- if (reg->type == CONST_PTR_TO_DYNPTR)
- return 0;
- spi = dynptr_get_spi(env, reg);
- if (spi < 0)
- return spi;
- /* Caller ensures dynptr is valid and initialized, which means spi is in
- * bounds and spi is the first dynptr slot. Simply mark stack slot as
- * read.
- */
- return mark_stack_slot_obj_read(env, reg, spi, BPF_DYNPTR_NR_SLOTS);
-}
-
-static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
- int spi, int nr_slots)
-{
- return mark_stack_slot_obj_read(env, reg, spi, nr_slots);
-}
-
-static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
-{
- int spi;
-
- spi = irq_flag_get_spi(env, reg);
- if (spi < 0)
- return spi;
- return mark_stack_slot_obj_read(env, reg, spi, 1);
}
/* This function is supposed to be used by the following 32-bit optimization
@@ -3492,17 +3139,12 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
return __check_reg_arg(env, state->regs, regno, t);
}
-static int insn_stack_access_flags(int frameno, int spi)
-{
- return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno;
-}
-
static void mark_indirect_target(struct bpf_verifier_env *env, int idx)
{
env->insn_aux_data[idx].indirect_target = true;
}
-#define LR_FRAMENO_BITS 3
+#define LR_FRAMENO_BITS 4
#define LR_SPI_BITS 6
#define LR_ENTRY_BITS (LR_SPI_BITS + LR_FRAMENO_BITS + 1)
#define LR_SIZE_BITS 4
@@ -3511,7 +3153,11 @@ static void mark_indirect_target(struct bpf_verifier_env *env, int idx)
#define LR_SIZE_MASK ((1ull << LR_SIZE_BITS) - 1)
#define LR_SPI_OFF LR_FRAMENO_BITS
#define LR_IS_REG_OFF (LR_SPI_BITS + LR_FRAMENO_BITS)
-#define LINKED_REGS_MAX 6
+#define LINKED_REGS_MAX 5
+
+static_assert(MAX_CALL_FRAMES <= (1 << LR_FRAMENO_BITS));
+static_assert(LINKED_REGS_MAX < (1 << LR_SIZE_BITS));
+static_assert(LINKED_REGS_MAX * LR_ENTRY_BITS + LR_SIZE_BITS <= 64);
struct linked_reg {
u8 frameno;
@@ -3535,10 +3181,11 @@ static struct linked_reg *linked_regs_push(struct linked_regs *s)
return NULL;
}
-/* Use u64 as a vector of 6 10-bit values, use first 4-bits to track
+/*
+ * Use u64 as a vector of 5 11-bit values, use first 4-bits to track
* number of elements currently in stack.
- * Pack one history entry for linked registers as 10 bits in the following format:
- * - 3-bits frameno
+ * Pack one history entry for linked registers as 11 bits in the following format:
+ * - 4-bits frameno
* - 6-bits spi_or_reg
* - 1-bit is_reg
*/
@@ -3734,12 +3381,6 @@ static void assign_scalar_id_before_mov(struct bpf_verifier_env *env,
src_reg->id = ++env->id_gen;
}
-/* Copy src state preserving dst->parent and dst->live fields */
-static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src)
-{
- *dst = *src;
-}
-
static void save_register_state(struct bpf_verifier_env *env,
struct bpf_func_state *state,
int spi, struct bpf_reg_state *reg,
@@ -3747,7 +3388,7 @@ static void save_register_state(struct bpf_verifier_env *env,
{
int i;
- copy_register_state(&state->stack[spi].spilled_ptr, reg);
+ state->stack[spi].spilled_ptr = *reg;
for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--)
state->stack[spi].slot_type[i - 1] = STACK_SPILL;
@@ -3764,7 +3405,7 @@ static bool is_bpf_st_mem(struct bpf_insn *insn)
static int get_reg_width(struct bpf_reg_state *reg)
{
- return fls64(reg->umax_value);
+ return fls64(reg_umax(reg));
}
/* See comment for mark_fastcall_pattern_for_call() */
@@ -3817,7 +3458,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
struct bpf_reg_state *reg = NULL;
- int insn_flags = insn_stack_access_flags(state->frameno, spi);
+ int insn_flags = INSN_F_STACK_ACCESS;
+ int hist_spi = spi, hist_frame = state->frameno;
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
* so it's aligned access and [off, off + size) are within stack limits
@@ -3913,11 +3555,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
}
if (insn_flags)
- return bpf_push_jmp_history(env, env->cur_state, insn_flags, 0);
+ return bpf_push_jmp_history(env, env->cur_state, insn_flags,
+ hist_spi, hist_frame, 0);
return 0;
}
-/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is
+/* Write the stack: 'stack[ptr_reg + off] = value_regno'. 'ptr_reg' is
* known to contain a variable offset.
* This function checks whether the write is permitted and conservatively
* tracks the effects of the write, considering that each stack slot in the
@@ -3938,13 +3581,13 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
static int check_stack_write_var_off(struct bpf_verifier_env *env,
/* func where register points to */
struct bpf_func_state *state,
- int ptr_regno, int off, int size,
+ struct bpf_reg_state *ptr_reg, int off, int size,
int value_regno, int insn_idx)
{
struct bpf_func_state *cur; /* state of the current function */
int min_off, max_off;
int i, err;
- struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL;
+ struct bpf_reg_state *value_reg = NULL;
struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
bool writing_zero = false;
/* set if the fact that we're writing a zero is used to let any
@@ -3953,9 +3596,8 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
bool zero_used = false;
cur = env->cur_state->frame[env->cur_state->curframe];
- ptr_reg = &cur->regs[ptr_regno];
- min_off = ptr_reg->smin_value + off;
- max_off = ptr_reg->smax_value + off + size;
+ min_off = reg_smin(ptr_reg) + off;
+ max_off = reg_smax(ptr_reg) + off + size;
if (value_regno >= 0)
value_reg = &cur->regs[value_regno];
if ((value_reg && bpf_register_is_null(value_reg)) ||
@@ -4110,7 +3752,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
struct bpf_reg_state *reg;
u8 *stype, type;
- int insn_flags = insn_stack_access_flags(reg_state->frameno, spi);
+ int insn_flags = INSN_F_STACK_ACCESS;
+ int hist_spi = spi, hist_frame = reg_state->frameno;
stype = reg_state->stack[spi].slot_type;
reg = &reg_state->stack[spi].spilled_ptr;
@@ -4147,7 +3790,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
* with the destination register on fill.
*/
assign_scalar_id_before_mov(env, reg);
- copy_register_state(&state->regs[dst_regno], reg);
+ state->regs[dst_regno] = *reg;
state->regs[dst_regno].subreg_def = subreg_def;
/* Break the relation on a narrowing fill.
@@ -4202,7 +3845,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
* with the destination register on fill.
*/
assign_scalar_id_before_mov(env, reg);
- copy_register_state(&state->regs[dst_regno], reg);
+ state->regs[dst_regno] = *reg;
/* mark reg as written since spilled pointer state likely
* has its liveness marks cleared by is_state_visited()
* which resets stack/reg liveness for state transitions
@@ -4241,7 +3884,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
insn_flags = 0; /* we are not restoring spilled register */
}
if (insn_flags)
- return bpf_push_jmp_history(env, env->cur_state, insn_flags, 0);
+ return bpf_push_jmp_history(env, env->cur_state, insn_flags,
+ hist_spi, hist_frame, 0);
return 0;
}
@@ -4250,8 +3894,8 @@ enum bpf_access_src {
ACCESS_HELPER = 2, /* the access is performed by a helper */
};
-static int check_stack_range_initialized(struct bpf_verifier_env *env,
- int regno, int off, int access_size,
+static int check_stack_range_initialized(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ argno_t argno, int off, int access_size,
bool zero_size_allowed,
enum bpf_access_type type,
struct bpf_call_arg_meta *meta);
@@ -4261,37 +3905,35 @@ static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
return cur_regs(env) + regno;
}
-/* Read the stack at 'ptr_regno + off' and put the result into the register
+/* Read the stack at 'reg + off' and put the result into the register
* 'dst_regno'.
- * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'),
+ * 'off' includes the pointer register's fixed offset(i.e. 'reg->off'),
* but not its variable offset.
* 'size' is assumed to be <= reg size and the access is assumed to be aligned.
*
* As opposed to check_stack_read_fixed_off, this function doesn't deal with
* filling registers (i.e. reads of spilled register cannot be detected when
* the offset is not fixed). We conservatively mark 'dst_regno' as containing
- * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable
+ * SCALAR_VALUE. That's why we assert that the 'reg' has a variable
* offset; for a fixed offset check_stack_read_fixed_off should be used
* instead.
*/
-static int check_stack_read_var_off(struct bpf_verifier_env *env,
- int ptr_regno, int off, int size, int dst_regno)
+static int check_stack_read_var_off(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ argno_t ptr_argno, int off, int size, int dst_regno)
{
- /* The state of the source register. */
- struct bpf_reg_state *reg = reg_state(env, ptr_regno);
struct bpf_func_state *ptr_state = bpf_func(env, reg);
int err;
int min_off, max_off;
/* Note that we pass a NULL meta, so raw access will not be permitted.
*/
- err = check_stack_range_initialized(env, ptr_regno, off, size,
+ err = check_stack_range_initialized(env, reg, ptr_argno, off, size,
false, BPF_READ, NULL);
if (err)
return err;
- min_off = reg->smin_value + off;
- max_off = reg->smax_value + off;
+ min_off = reg_smin(reg) + off;
+ max_off = reg_smax(reg) + off;
mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno);
check_fastcall_stack_contract(env, ptr_state, env->insn_idx, min_off);
return 0;
@@ -4307,10 +3949,9 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env,
* can be -1, meaning that the read value is not going to a register.
*/
static int check_stack_read(struct bpf_verifier_env *env,
- int ptr_regno, int off, int size,
+ struct bpf_reg_state *reg, argno_t ptr_argno, int off, int size,
int dst_regno)
{
- struct bpf_reg_state *reg = reg_state(env, ptr_regno);
struct bpf_func_state *state = bpf_func(env, reg);
int err;
/* Some accesses are only permitted with a static offset. */
@@ -4346,7 +3987,7 @@ static int check_stack_read(struct bpf_verifier_env *env,
* than fixed offset ones. Note that dst_regno >= 0 on this
* branch.
*/
- err = check_stack_read_var_off(env, ptr_regno, off, size,
+ err = check_stack_read_var_off(env, reg, ptr_argno, off, size,
dst_regno);
}
return err;
@@ -4356,17 +3997,16 @@ static int check_stack_read(struct bpf_verifier_env *env,
/* check_stack_write dispatches to check_stack_write_fixed_off or
* check_stack_write_var_off.
*
- * 'ptr_regno' is the register used as a pointer into the stack.
+ * 'reg' is the register used as a pointer into the stack.
* 'value_regno' is the register whose value we're writing to the stack. It can
* be -1, meaning that we're not writing from a register.
*
* The caller must ensure that the offset falls within the maximum stack size.
*/
static int check_stack_write(struct bpf_verifier_env *env,
- int ptr_regno, int off, int size,
+ struct bpf_reg_state *reg, int off, int size,
int value_regno, int insn_idx)
{
- struct bpf_reg_state *reg = reg_state(env, ptr_regno);
struct bpf_func_state *state = bpf_func(env, reg);
int err;
@@ -4379,28 +4019,135 @@ static int check_stack_write(struct bpf_verifier_env *env,
* than fixed offset ones.
*/
err = check_stack_write_var_off(env, state,
- ptr_regno, off, size,
+ reg, off, size,
value_regno, insn_idx);
}
return err;
}
-static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
+/*
+ * Write a value to the outgoing stack arg area.
+ * off is a negative offset from r11 (e.g. -8 for arg6, -16 for arg7).
+ */
+static int check_stack_arg_write(struct bpf_verifier_env *env, struct bpf_func_state *state,
+ int off, struct bpf_reg_state *value_reg)
+{
+ int max_stack_arg_regs = MAX_BPF_FUNC_ARGS - MAX_BPF_FUNC_REG_ARGS;
+ struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno];
+ int spi = -off / BPF_REG_SIZE - 1;
+ struct bpf_reg_state *arg;
+ int err;
+
+ if (spi >= max_stack_arg_regs) {
+ verbose(env, "stack arg write offset %d exceeds max %d stack args\n",
+ off, max_stack_arg_regs);
+ return -EINVAL;
+ }
+
+ err = grow_stack_arg_slots(env, state, spi + 1);
+ if (err)
+ return err;
+
+ /* Track the max outgoing stack arg slot count. */
+ if (spi + 1 > subprog->max_out_stack_arg_cnt)
+ subprog->max_out_stack_arg_cnt = spi + 1;
+
+ if (value_reg) {
+ state->stack_arg_regs[spi] = *value_reg;
+ } else {
+ /* BPF_ST: store immediate, treat as scalar */
+ arg = &state->stack_arg_regs[spi];
+ arg->type = SCALAR_VALUE;
+ __mark_reg_known(arg, env->prog->insnsi[env->insn_idx].imm);
+ }
+ state->no_stack_arg_load = true;
+ return bpf_push_jmp_history(env, env->cur_state,
+ INSN_F_STACK_ARG_ACCESS, spi, 0, 0);
+}
+
+/*
+ * Read a value from the incoming stack arg area.
+ * off is a positive offset from r11 (e.g. +8 for arg6, +16 for arg7).
+ */
+static int check_stack_arg_read(struct bpf_verifier_env *env, struct bpf_func_state *state,
+ int off, int dst_regno)
+{
+ struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno];
+ struct bpf_verifier_state *vstate = env->cur_state;
+ int spi = off / BPF_REG_SIZE - 1;
+ struct bpf_func_state *caller, *cur;
+ struct bpf_reg_state *arg;
+
+ if (state->no_stack_arg_load) {
+ verbose(env, "r11 load must be before any r11 store or call insn\n");
+ return -EINVAL;
+ }
+
+ if (spi + 1 > bpf_in_stack_arg_cnt(subprog)) {
+ verbose(env, "invalid read from stack arg off %d depth %d\n",
+ off, bpf_in_stack_arg_cnt(subprog) * BPF_REG_SIZE);
+ return -EACCES;
+ }
+
+ caller = vstate->frame[vstate->curframe - 1];
+ arg = &caller->stack_arg_regs[spi];
+ cur = vstate->frame[vstate->curframe];
+ cur->regs[dst_regno] = *arg;
+ return bpf_push_jmp_history(env, env->cur_state,
+ INSN_F_STACK_ARG_ACCESS, spi, 0, 0);
+}
+
+static int mark_stack_arg_precision(struct bpf_verifier_env *env, int arg_idx)
+{
+ struct bpf_func_state *caller = cur_func(env);
+ int spi = arg_idx - MAX_BPF_FUNC_REG_ARGS;
+
+ bt_set_frame_stack_arg_slot(&env->bt, caller->frameno, spi);
+ return mark_chain_precision_batch(env, env->cur_state);
+}
+
+static int check_outgoing_stack_args(struct bpf_verifier_env *env, struct bpf_func_state *caller,
+ int nargs)
+{
+ int i, spi;
+
+ for (i = MAX_BPF_FUNC_REG_ARGS; i < nargs; i++) {
+ spi = i - MAX_BPF_FUNC_REG_ARGS;
+ if (spi >= caller->out_stack_arg_cnt ||
+ caller->stack_arg_regs[spi].type == NOT_INIT) {
+ verbose(env, "callee expects %d args, stack arg%d is not initialized\n",
+ nargs, spi + 1);
+ return -EFAULT;
+ }
+ }
+
+ return 0;
+}
+
+static struct bpf_reg_state *get_func_arg_reg(struct bpf_func_state *caller,
+ struct bpf_reg_state *regs, int arg)
+{
+ if (arg < MAX_BPF_FUNC_REG_ARGS)
+ return &regs[arg + 1];
+
+ return &caller->stack_arg_regs[arg - MAX_BPF_FUNC_REG_ARGS];
+}
+
+static int check_map_access_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
int off, int size, enum bpf_access_type type)
{
- struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_map *map = reg->map_ptr;
u32 cap = bpf_map_flags_to_cap(map);
if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) {
verbose(env, "write into map forbidden, value_size=%d off=%lld size=%d\n",
- map->value_size, reg->smin_value + off, size);
+ map->value_size, reg_smin(reg) + off, size);
return -EACCES;
}
if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) {
verbose(env, "read from map forbidden, value_size=%d off=%lld size=%d\n",
- map->value_size, reg->smin_value + off, size);
+ map->value_size, reg_smin(reg) + off, size);
return -EACCES;
}
@@ -4408,17 +4155,15 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
}
/* check read/write into memory region (e.g., map value, ringbuf sample, etc) */
-static int __check_mem_access(struct bpf_verifier_env *env, int regno,
+static int __check_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno,
int off, int size, u32 mem_size,
bool zero_size_allowed)
{
bool size_ok = size > 0 || (size == 0 && zero_size_allowed);
- struct bpf_reg_state *reg;
if (off >= 0 && size_ok && (u64)off + size <= mem_size)
return 0;
- reg = &cur_regs(env)[regno];
switch (reg->type) {
case PTR_TO_MAP_KEY:
verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n",
@@ -4431,8 +4176,8 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno,
case PTR_TO_PACKET:
case PTR_TO_PACKET_META:
case PTR_TO_PACKET_END:
- verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
- off, size, regno, reg->id, off, mem_size);
+ verbose(env, "invalid access to packet, off=%d size=%d, %s(id=%d,off=%d,r=%d)\n",
+ off, size, reg_arg_name(env, argno), reg->id, off, mem_size);
break;
case PTR_TO_CTX:
verbose(env, "invalid access to context, ctx_size=%d off=%d size=%d\n",
@@ -4448,13 +4193,10 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno,
}
/* check read/write into a memory region with possible variable offset */
-static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
+static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno,
int off, int size, u32 mem_size,
bool zero_size_allowed)
{
- struct bpf_verifier_state *vstate = env->cur_state;
- struct bpf_func_state *state = vstate->frame[vstate->curframe];
- struct bpf_reg_state *reg = &state->regs[regno];
int err;
/* We may have adjusted the register pointing to memory region, so we
@@ -4467,36 +4209,36 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
* index'es we need to make sure that whatever we use
* will have a set floor within our range.
*/
- if (reg->smin_value < 0 &&
- (reg->smin_value == S64_MIN ||
- (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) ||
- reg->smin_value + off < 0)) {
- verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
- regno);
+ if (reg_smin(reg) < 0 &&
+ (reg_smin(reg) == S64_MIN ||
+ (off + reg_smin(reg) != (s64)(s32)(off + reg_smin(reg))) ||
+ reg_smin(reg) + off < 0)) {
+ verbose(env, "%s min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ reg_arg_name(env, argno));
return -EACCES;
}
- err = __check_mem_access(env, regno, reg->smin_value + off, size,
+ err = __check_mem_access(env, reg, argno, reg_smin(reg) + off, size,
mem_size, zero_size_allowed);
if (err) {
- verbose(env, "R%d min value is outside of the allowed memory range\n",
- regno);
+ verbose(env, "%s min value is outside of the allowed memory range\n",
+ reg_arg_name(env, argno));
return err;
}
/* If we haven't set a max value then we need to bail since we can't be
* sure we won't do bad things.
- * If reg->umax_value + off could overflow, treat that as unbounded too.
+ * If reg_umax(reg) + off could overflow, treat that as unbounded too.
*/
- if (reg->umax_value >= BPF_MAX_VAR_OFF) {
- verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n",
- regno);
+ if (reg_umax(reg) >= BPF_MAX_VAR_OFF) {
+ verbose(env, "%s unbounded memory access, make sure to bounds check any such access\n",
+ reg_arg_name(env, argno));
return -EACCES;
}
- err = __check_mem_access(env, regno, reg->umax_value + off, size,
+ err = __check_mem_access(env, reg, argno, reg_umax(reg) + off, size,
mem_size, zero_size_allowed);
if (err) {
- verbose(env, "R%d max value is outside of the allowed memory range\n",
- regno);
+ verbose(env, "%s max value is outside of the allowed memory range\n",
+ reg_arg_name(env, argno));
return err;
}
@@ -4504,7 +4246,7 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
}
static int __check_ptr_off_reg(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg, int regno,
+ const struct bpf_reg_state *reg, argno_t argno,
bool fixed_off_ok)
{
/* Access to this pointer-typed register or passing it to a helper
@@ -4520,15 +4262,15 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env,
return -EACCES;
}
- if (reg->smin_value < 0) {
- verbose(env, "negative offset %s ptr R%d off=%lld disallowed\n",
- reg_type_str(env, reg->type), regno, reg->var_off.value);
+ if (reg_smin(reg) < 0) {
+ verbose(env, "negative offset %s ptr %s off=%lld disallowed\n",
+ reg_type_str(env, reg->type), reg_arg_name(env, argno), reg->var_off.value);
return -EACCES;
}
if (!fixed_off_ok && reg->var_off.value != 0) {
- verbose(env, "dereference of modified %s ptr R%d off=%lld disallowed\n",
- reg_type_str(env, reg->type), regno, reg->var_off.value);
+ verbose(env, "dereference of modified %s ptr %s off=%lld disallowed\n",
+ reg_type_str(env, reg->type), reg_arg_name(env, argno), reg->var_off.value);
return -EACCES;
}
@@ -4538,7 +4280,7 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env,
static int check_ptr_off_reg(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg, int regno)
{
- return __check_ptr_off_reg(env, reg, regno, false);
+ return __check_ptr_off_reg(env, reg, argno_from_reg(regno), false);
}
static int map_kptr_match_type(struct bpf_verifier_env *env,
@@ -4574,9 +4316,9 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
* referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the
* normal store of unreferenced kptr, we must ensure var_off is zero.
* Since ref_ptr cannot be accessed directly by BPF insns, check for
- * reg->ref_obj_id is not needed here.
+ * reg->id is not needed here.
*/
- if (__check_ptr_off_reg(env, reg, regno, true))
+ if (__check_ptr_off_reg(env, reg, argno_from_reg(regno), true))
return -EACCES;
/* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and
@@ -4719,7 +4461,7 @@ static int mark_uptr_ld_reg(struct bpf_verifier_env *env, u32 regno,
return 0;
}
-static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
+static int check_map_kptr_access(struct bpf_verifier_env *env,
int value_regno, int insn_idx,
struct btf_field *kptr_field)
{
@@ -4796,19 +4538,16 @@ static u32 map_mem_size(const struct bpf_map *map)
}
/* check read/write into a map element with possible variable offset */
-static int check_map_access(struct bpf_verifier_env *env, u32 regno,
+static int check_map_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno,
int off, int size, bool zero_size_allowed,
enum bpf_access_src src)
{
- struct bpf_verifier_state *vstate = env->cur_state;
- struct bpf_func_state *state = vstate->frame[vstate->curframe];
- struct bpf_reg_state *reg = &state->regs[regno];
struct bpf_map *map = reg->map_ptr;
u32 mem_size = map_mem_size(map);
struct btf_record *rec;
int err, i;
- err = check_mem_region_access(env, regno, off, size, mem_size, zero_size_allowed);
+ err = check_mem_region_access(env, reg, argno, off, size, mem_size, zero_size_allowed);
if (err)
return err;
@@ -4823,8 +4562,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* this program. To check that [x1, x2) overlaps with [y1, y2),
* it is sufficient to check x1 < y2 && y1 < x2.
*/
- if (reg->smin_value + off < p + field->size &&
- p < reg->umax_value + off + size) {
+ if (reg_smin(reg) + off < p + field->size &&
+ p < reg_umax(reg) + off + size) {
switch (field->type) {
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
@@ -4904,30 +4643,29 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
}
}
-static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
+static int check_packet_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off,
int size, bool zero_size_allowed)
{
- struct bpf_reg_state *reg = reg_state(env, regno);
int err;
if (reg->range < 0) {
- verbose(env, "R%d offset is outside of the packet\n", regno);
+ verbose(env, "%s offset is outside of the packet\n", reg_arg_name(env, argno));
return -EINVAL;
}
- err = check_mem_region_access(env, regno, off, size, reg->range, zero_size_allowed);
+ err = check_mem_region_access(env, reg, argno, off, size, reg->range, zero_size_allowed);
if (err)
return err;
/* __check_mem_access has made sure "off + size - 1" is within u16.
- * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff,
+ * reg_umax(reg) can't be bigger than MAX_PACKET_OFF which is 0xffff,
* otherwise find_good_pkt_pointers would have refused to set range info
* that __check_mem_access would have rejected this pkt access.
- * Therefore, "off + reg->umax_value + size - 1" won't overflow u32.
+ * Therefore, "off + reg_umax(reg) + size - 1" won't overflow u32.
*/
env->prog->aux->max_pkt_offset =
max_t(u32, env->prog->aux->max_pkt_offset,
- off + reg->umax_value + size - 1);
+ off + reg_umax(reg) + size - 1);
return 0;
}
@@ -4951,8 +4689,8 @@ static int __check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int of
* type of narrower access.
*/
if (base_type(info->reg_type) == PTR_TO_BTF_ID) {
- if (info->ref_obj_id &&
- !find_reference_state(env->cur_state, info->ref_obj_id)) {
+ if (info->ref_id &&
+ !find_reference_state(env->cur_state, info->ref_id)) {
verbose(env, "invalid bpf_context access off=%d. Reference may already be released\n",
off);
return -EACCES;
@@ -4970,7 +4708,7 @@ static int __check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int of
return -EACCES;
}
-static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, u32 regno,
+static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *reg, argno_t argno,
int off, int access_size, enum bpf_access_type t,
struct bpf_insn_access_aux *info)
{
@@ -4980,17 +4718,15 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
*/
bool var_off_ok = is_var_ctx_off_allowed(env->prog);
bool fixed_off_ok = !env->ops->convert_ctx_access;
- struct bpf_reg_state *regs = cur_regs(env);
- struct bpf_reg_state *reg = regs + regno;
int err;
if (var_off_ok)
- err = check_mem_region_access(env, regno, off, access_size, U16_MAX, false);
+ err = check_mem_region_access(env, reg, argno, off, access_size, U16_MAX, false);
else
- err = __check_ptr_off_reg(env, reg, regno, fixed_off_ok);
+ err = __check_ptr_off_reg(env, reg, argno, fixed_off_ok);
if (err)
return err;
- off += reg->umax_value;
+ off += reg_umax(reg);
err = __check_ctx_access(env, insn_idx, off, access_size, t, info);
if (err)
@@ -4998,9 +4734,21 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
return err;
}
-static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
- int size)
+static int check_flow_keys_access(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, argno_t argno,
+ int off, int size)
{
+ /* Only a constant offset is allowed here; fold it into off. */
+ if (!tnum_is_const(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "%s invalid variable offset to flow keys: off=%d, var_off=%s\n",
+ reg_arg_name(env, argno), off, tn_buf);
+ return -EACCES;
+ }
+ off += reg->var_off.value;
+
if (size < 0 || off < 0 ||
(u64)off + size > sizeof(struct bpf_flow_keys)) {
verbose(env, "invalid access to flow keys off=%d size=%d\n",
@@ -5011,16 +4759,15 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
}
static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
- u32 regno, int off, int size,
+ struct bpf_reg_state *reg, argno_t argno, int off, int size,
enum bpf_access_type t)
{
- struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_insn_access_aux info = {};
bool valid;
- if (reg->smin_value < 0) {
- verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
- regno);
+ if (reg_smin(reg) < 0) {
+ verbose(env, "%s min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ reg_arg_name(env, argno));
return -EACCES;
}
@@ -5048,8 +4795,8 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
return 0;
}
- verbose(env, "R%d invalid %s access off=%d size=%d\n",
- regno, reg_type_str(env, reg->type), off, size);
+ verbose(env, "%s invalid %s access off=%d size=%d\n",
+ reg_arg_name(env, argno), reg_type_str(env, reg->type), off, size);
return -EACCES;
}
@@ -5124,10 +4871,10 @@ static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
[CONST_PTR_TO_MAP] = btf_bpf_map_id,
};
-static bool is_trusted_reg(const struct bpf_reg_state *reg)
+static bool is_trusted_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg)
{
/* A referenced register is always trusted. */
- if (reg->ref_obj_id)
+ if (reg_is_referenced(env, reg))
return true;
/* Types listed in the reg2btf_ids are always trusted */
@@ -5369,7 +5116,10 @@ process_func:
}
subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth);
- if (priv_stack_supported) {
+ if (IS_ENABLED(CONFIG_X86_64) && subprog[idx].stack_arg_cnt) {
+ /* x86-64 uses R9 for both private stack frame pointer and arg6. */
+ subprog[idx].priv_stack_mode = NO_PRIV_STACK;
+ } else if (priv_stack_supported) {
/* Request private stack support only if the subprog stack
* depth is no less than BPF_PRIV_STACK_MIN_SIZE. This is to
* avoid jit penalty if the stack usage is small.
@@ -5380,6 +5130,8 @@ process_func:
}
if (subprog[idx].priv_stack_mode == PRIV_STACK_ADAPTIVE) {
+ if (subprog_depth > env->max_stack_depth)
+ env->max_stack_depth = subprog_depth;
if (subprog_depth > MAX_BPF_STACK) {
verbose(env, "stack size of subprog %d is %d. Too large\n",
idx, subprog_depth);
@@ -5387,6 +5139,8 @@ process_func:
}
} else {
depth += subprog_depth;
+ if (depth > env->max_stack_depth)
+ env->max_stack_depth = depth;
if (depth > MAX_BPF_STACK) {
total = 0;
for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller)
@@ -5405,7 +5159,7 @@ continue_func:
if (bpf_pseudo_kfunc_call(insn + i) && !insn[i].off) {
bool err = false;
- if (!is_bpf_throw_kfunc(insn + i))
+ if (!bpf_is_throw_kfunc(insn + i))
continue;
for (tmp = idx; tmp >= 0 && !err; tmp = dinfo[tmp].caller) {
if (subprog[tmp].is_cb) {
@@ -5473,14 +5227,23 @@ continue_func:
* this info will be utilized by JIT so that we will be preserving the
* tail call counter throughout bpf2bpf calls combined with tailcalls
*/
- if (tail_call_reachable)
+ if (tail_call_reachable) {
for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller) {
if (subprog[tmp].is_exception_cb) {
verbose(env, "cannot tail call within exception cb\n");
return -EINVAL;
}
+ if (subprog[tmp].stack_arg_cnt) {
+ verbose(env, "tail_calls are not allowed in programs with stack args\n");
+ return -EINVAL;
+ }
subprog[tmp].tail_call_reachable = true;
}
+ } else if (!idx && subprog[0].has_tail_call && subprog[0].stack_arg_cnt) {
+ verbose(env, "tail_calls are not allowed in programs with stack args\n");
+ return -EINVAL;
+ }
+
if (subprog[0].tail_call_reachable)
env->prog->aux->tail_call_reachable = true;
@@ -5499,6 +5262,9 @@ continue_func:
frame = dinfo[idx].frame;
i = dinfo[idx].ret_insn;
+ /* reset tail_call_reachable to the parent's actual state */
+ tail_call_reachable = subprog[idx].tail_call_reachable;
+
goto continue_func;
}
@@ -5559,12 +5325,12 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
static int __check_buffer_access(struct bpf_verifier_env *env,
const char *buf_info,
const struct bpf_reg_state *reg,
- int regno, int off, int size)
+ argno_t argno, int off, int size)
{
if (off < 0) {
verbose(env,
- "R%d invalid %s buffer access: off=%d, size=%d\n",
- regno, buf_info, off, size);
+ "%s invalid %s buffer access: off=%d, size=%d\n",
+ reg_arg_name(env, argno), buf_info, off, size);
return -EACCES;
}
if (!tnum_is_const(reg->var_off)) {
@@ -5572,8 +5338,8 @@ static int __check_buffer_access(struct bpf_verifier_env *env,
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
verbose(env,
- "R%d invalid variable buffer offset: off=%d, var_off=%s\n",
- regno, off, tn_buf);
+ "%s invalid variable buffer offset: off=%d, var_off=%s\n",
+ reg_arg_name(env, argno), off, tn_buf);
return -EACCES;
}
@@ -5582,11 +5348,11 @@ static int __check_buffer_access(struct bpf_verifier_env *env,
static int check_tp_buffer_access(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
- int regno, int off, int size)
+ argno_t argno, int off, int size)
{
int err;
- err = __check_buffer_access(env, "tracepoint", reg, regno, off, size);
+ err = __check_buffer_access(env, "tracepoint", reg, argno, off, size);
if (err)
return err;
@@ -5598,14 +5364,14 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env,
static int check_buffer_access(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
- int regno, int off, int size,
+ argno_t argno, int off, int size,
bool zero_size_allowed,
u32 *max_access)
{
const char *buf_info = type_is_rdonly_mem(reg->type) ? "rdonly" : "rdwr";
int err;
- err = __check_buffer_access(env, buf_info, reg, regno, off, size);
+ err = __check_buffer_access(env, buf_info, reg, argno, off, size);
if (err)
return err;
@@ -5618,7 +5384,7 @@ static int check_buffer_access(struct bpf_verifier_env *env,
static void zext_32_to_64(struct bpf_reg_state *reg)
{
reg->var_off = tnum_subreg(reg->var_off);
- __reg_assign_32_into_64(reg);
+ reg_set_urange64(reg, reg_u32_min(reg), reg_u32_max(reg));
}
/* truncate register to smaller size (in bytes)
@@ -5633,15 +5399,10 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
/* fix arithmetic bounds */
mask = ((u64)1 << (size * 8)) - 1;
- if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) {
- reg->umin_value &= mask;
- reg->umax_value &= mask;
- } else {
- reg->umin_value = 0;
- reg->umax_value = mask;
- }
- reg->smin_value = reg->umin_value;
- reg->smax_value = reg->umax_value;
+ if ((reg_umin(reg) & ~mask) == (reg_umax(reg) & ~mask))
+ reg_set_urange64(reg, reg_umin(reg) & mask, reg_umax(reg) & mask);
+ else
+ reg_set_urange64(reg, 0, mask);
/* If size is smaller than 32bit register the 32bit register
* values are also truncated so we push 64-bit bounds into
@@ -5656,19 +5417,16 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
static void set_sext64_default_val(struct bpf_reg_state *reg, int size)
{
if (size == 1) {
- reg->smin_value = reg->s32_min_value = S8_MIN;
- reg->smax_value = reg->s32_max_value = S8_MAX;
+ reg_set_srange64(reg, S8_MIN, S8_MAX);
+ reg_set_srange32(reg, S8_MIN, S8_MAX);
} else if (size == 2) {
- reg->smin_value = reg->s32_min_value = S16_MIN;
- reg->smax_value = reg->s32_max_value = S16_MAX;
+ reg_set_srange64(reg, S16_MIN, S16_MAX);
+ reg_set_srange32(reg, S16_MIN, S16_MAX);
} else {
/* size == 4 */
- reg->smin_value = reg->s32_min_value = S32_MIN;
- reg->smax_value = reg->s32_max_value = S32_MAX;
+ reg_set_srange64(reg, S32_MIN, S32_MAX);
+ reg_set_srange32(reg, S32_MIN, S32_MAX);
}
- reg->umin_value = reg->u32_min_value = 0;
- reg->umax_value = U64_MAX;
- reg->u32_max_value = U32_MAX;
reg->var_off = tnum_unknown;
}
@@ -5689,29 +5447,27 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size)
reg->var_off = tnum_const((s32)u64_cval);
u64_cval = reg->var_off.value;
- reg->smax_value = reg->smin_value = u64_cval;
- reg->umax_value = reg->umin_value = u64_cval;
- reg->s32_max_value = reg->s32_min_value = u64_cval;
- reg->u32_max_value = reg->u32_min_value = u64_cval;
+ reg->r64 = cnum64_from_urange(u64_cval, u64_cval);
+ reg->r32 = cnum32_from_urange((u32)u64_cval, (u32)u64_cval);
return;
}
- top_smax_value = ((u64)reg->smax_value >> num_bits) << num_bits;
- top_smin_value = ((u64)reg->smin_value >> num_bits) << num_bits;
+ top_smax_value = ((u64)reg_smax(reg) >> num_bits) << num_bits;
+ top_smin_value = ((u64)reg_smin(reg) >> num_bits) << num_bits;
if (top_smax_value != top_smin_value)
goto out;
/* find the s64_min and s64_min after sign extension */
if (size == 1) {
- init_s64_max = (s8)reg->smax_value;
- init_s64_min = (s8)reg->smin_value;
+ init_s64_max = (s8)reg_smax(reg);
+ init_s64_min = (s8)reg_smin(reg);
} else if (size == 2) {
- init_s64_max = (s16)reg->smax_value;
- init_s64_min = (s16)reg->smin_value;
+ init_s64_max = (s16)reg_smax(reg);
+ init_s64_min = (s16)reg_smin(reg);
} else {
- init_s64_max = (s32)reg->smax_value;
- init_s64_min = (s32)reg->smin_value;
+ init_s64_max = (s32)reg_smax(reg);
+ init_s64_min = (s32)reg_smin(reg);
}
s64_max = max(init_s64_max, init_s64_min);
@@ -5719,10 +5475,8 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size)
/* both of s64_max/s64_min positive or negative */
if ((s64_max >= 0) == (s64_min >= 0)) {
- reg->s32_min_value = reg->smin_value = s64_min;
- reg->s32_max_value = reg->smax_value = s64_max;
- reg->u32_min_value = reg->umin_value = s64_min;
- reg->u32_max_value = reg->umax_value = s64_max;
+ reg_set_srange64(reg, s64_min, s64_max);
+ reg_set_srange32(reg, s64_min, s64_max);
reg->var_off = tnum_range(s64_min, s64_max);
return;
}
@@ -5733,16 +5487,11 @@ out:
static void set_sext32_default_val(struct bpf_reg_state *reg, int size)
{
- if (size == 1) {
- reg->s32_min_value = S8_MIN;
- reg->s32_max_value = S8_MAX;
- } else {
+ if (size == 1)
+ reg_set_srange32(reg, S8_MIN, S8_MAX);
+ else
/* size == 2 */
- reg->s32_min_value = S16_MIN;
- reg->s32_max_value = S16_MAX;
- }
- reg->u32_min_value = 0;
- reg->u32_max_value = U32_MAX;
+ reg_set_srange32(reg, S16_MIN, S16_MAX);
reg->var_off = tnum_subreg(tnum_unknown);
}
@@ -5760,34 +5509,30 @@ static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size)
reg->var_off = tnum_const((s16)u32_val);
u32_val = reg->var_off.value;
- reg->s32_min_value = reg->s32_max_value = u32_val;
- reg->u32_min_value = reg->u32_max_value = u32_val;
+ reg_set_srange32(reg, u32_val, u32_val);
return;
}
- top_smax_value = ((u32)reg->s32_max_value >> num_bits) << num_bits;
- top_smin_value = ((u32)reg->s32_min_value >> num_bits) << num_bits;
+ top_smax_value = ((u32)reg_s32_max(reg) >> num_bits) << num_bits;
+ top_smin_value = ((u32)reg_s32_min(reg) >> num_bits) << num_bits;
if (top_smax_value != top_smin_value)
goto out;
/* find the s32_min and s32_min after sign extension */
if (size == 1) {
- init_s32_max = (s8)reg->s32_max_value;
- init_s32_min = (s8)reg->s32_min_value;
+ init_s32_max = (s8)reg_s32_max(reg);
+ init_s32_min = (s8)reg_s32_min(reg);
} else {
/* size == 2 */
- init_s32_max = (s16)reg->s32_max_value;
- init_s32_min = (s16)reg->s32_min_value;
+ init_s32_max = (s16)reg_s32_max(reg);
+ init_s32_min = (s16)reg_s32_min(reg);
}
s32_max = max(init_s32_max, init_s32_min);
s32_min = min(init_s32_max, init_s32_min);
if ((s32_min >= 0) == (s32_max >= 0)) {
- reg->s32_min_value = s32_min;
- reg->s32_max_value = s32_max;
- reg->u32_min_value = (u32)s32_min;
- reg->u32_max_value = (u32)s32_max;
+ reg_set_srange32(reg, s32_min, s32_max);
reg->var_off = tnum_subreg(tnum_range(s32_min, s32_max));
return;
}
@@ -5977,12 +5722,11 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env,
}
static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
- struct bpf_reg_state *regs,
- int regno, int off, int size,
+ struct bpf_reg_state *regs, struct bpf_reg_state *reg,
+ argno_t argno, int off, int size,
enum bpf_access_type atype,
int value_regno)
{
- struct bpf_reg_state *reg = regs + regno;
const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
const char *tname = btf_name_by_offset(reg->btf, t->name_off);
const char *field_name = NULL;
@@ -6008,8 +5752,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
verbose(env,
- "R%d is ptr_%s invalid variable offset: off=%d, var_off=%s\n",
- regno, tname, off, tn_buf);
+ "%s is ptr_%s invalid variable offset: off=%d, var_off=%s\n",
+ reg_arg_name(env, argno), tname, off, tn_buf);
return -EACCES;
}
@@ -6017,22 +5761,22 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
if (off < 0) {
verbose(env,
- "R%d is ptr_%s invalid negative access: off=%d\n",
- regno, tname, off);
+ "%s is ptr_%s invalid negative access: off=%d\n",
+ reg_arg_name(env, argno), tname, off);
return -EACCES;
}
if (reg->type & MEM_USER) {
verbose(env,
- "R%d is ptr_%s access user memory: off=%d\n",
- regno, tname, off);
+ "%s is ptr_%s access user memory: off=%d\n",
+ reg_arg_name(env, argno), tname, off);
return -EACCES;
}
if (reg->type & MEM_PERCPU) {
verbose(env,
- "R%d is ptr_%s access percpu memory: off=%d\n",
- regno, tname, off);
+ "%s is ptr_%s access percpu memory: off=%d\n",
+ reg_arg_name(env, argno), tname, off);
return -EACCES;
}
@@ -6044,7 +5788,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
ret = env->ops->btf_struct_access(&env->log, reg, off, size);
} else {
/* Writes are permitted with default btf_struct_access for
- * program allocated objects (which always have ref_obj_id > 0),
+ * program allocated objects (which always have id > 0),
* but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC.
*/
if (atype != BPF_READ && !type_is_ptr_alloc_obj(reg->type)) {
@@ -6053,8 +5797,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
}
if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) &&
- !(reg->type & MEM_RCU) && !reg->ref_obj_id) {
- verifier_bug(env, "ref_obj_id for allocated object must be non-zero");
+ !(reg->type & MEM_RCU) && !reg_is_referenced(env, reg)) {
+ verifier_bug(env, "allocated object must have a referenced id");
return -EFAULT;
}
@@ -6073,7 +5817,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
*/
flag = PTR_UNTRUSTED;
- } else if (is_trusted_reg(reg) || is_rcu_reg(reg)) {
+ } else if (is_trusted_reg(env, reg) || is_rcu_reg(reg)) {
/* By default any pointer obtained from walking a trusted pointer is no
* longer trusted, unless the field being accessed has explicitly been
* marked as inheriting its parent's state of trust (either full or RCU).
@@ -6134,12 +5878,11 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
}
static int check_ptr_to_map_access(struct bpf_verifier_env *env,
- struct bpf_reg_state *regs,
- int regno, int off, int size,
+ struct bpf_reg_state *regs, struct bpf_reg_state *reg,
+ argno_t argno, int off, int size,
enum bpf_access_type atype,
int value_regno)
{
- struct bpf_reg_state *reg = regs + regno;
struct bpf_map *map = reg->map_ptr;
struct bpf_reg_state map_reg;
enum bpf_type_flag flag = 0;
@@ -6170,8 +5913,8 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
}
if (off < 0) {
- verbose(env, "R%d is %s invalid negative access: off=%d\n",
- regno, tname, off);
+ verbose(env, "%s is %s invalid negative access: off=%d\n",
+ reg_arg_name(env, argno), tname, off);
return -EACCES;
}
@@ -6228,11 +5971,10 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env,
* 'off' includes `regno->offset`, but not its dynamic part (if any).
*/
static int check_stack_access_within_bounds(
- struct bpf_verifier_env *env,
- int regno, int off, int access_size,
+ struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ argno_t argno, int off, int access_size,
enum bpf_access_type type)
{
- struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_func_state *state = bpf_func(env, reg);
s64 min_off, max_off;
int err;
@@ -6247,14 +5989,14 @@ static int check_stack_access_within_bounds(
min_off = (s64)reg->var_off.value + off;
max_off = min_off + access_size;
} else {
- if (reg->smax_value >= BPF_MAX_VAR_OFF ||
- reg->smin_value <= -BPF_MAX_VAR_OFF) {
- verbose(env, "invalid unbounded variable-offset%s stack R%d\n",
- err_extra, regno);
+ if (reg_smax(reg) >= BPF_MAX_VAR_OFF ||
+ reg_smin(reg) <= -BPF_MAX_VAR_OFF) {
+ verbose(env, "invalid unbounded variable-offset%s stack %s\n",
+ err_extra, reg_arg_name(env, argno));
return -EACCES;
}
- min_off = reg->smin_value + off;
- max_off = reg->smax_value + off + access_size;
+ min_off = reg_smin(reg) + off;
+ max_off = reg_smax(reg) + off + access_size;
}
err = check_stack_slot_within_bounds(env, min_off, state, type);
@@ -6268,14 +6010,14 @@ static int check_stack_access_within_bounds(
if (err) {
if (tnum_is_const(reg->var_off)) {
- verbose(env, "invalid%s stack R%d off=%lld size=%d\n",
- err_extra, regno, min_off, access_size);
+ verbose(env, "invalid%s stack %s off=%lld size=%d\n",
+ err_extra, reg_arg_name(env, argno), min_off, access_size);
} else {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "invalid variable-offset%s stack R%d var_off=%s off=%d size=%d\n",
- err_extra, regno, tn_buf, off, access_size);
+ verbose(env, "invalid variable-offset%s stack %s var_off=%s off=%d size=%d\n",
+ err_extra, reg_arg_name(env, argno), tn_buf, off, access_size);
}
return err;
}
@@ -6320,12 +6062,11 @@ static void add_scalar_to_reg(struct bpf_reg_state *dst_reg, s64 val)
* if t==write && value_regno==-1, some unknown value is stored into memory
* if t==read && value_regno==-1, don't care what we read from memory
*/
-static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno,
+static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *reg, argno_t argno,
int off, int bpf_size, enum bpf_access_type t,
int value_regno, bool strict_alignment_once, bool is_ldsx)
{
struct bpf_reg_state *regs = cur_regs(env);
- struct bpf_reg_state *reg = regs + regno;
int size, err = 0;
size = bpf_size_to_bytes(bpf_size);
@@ -6338,11 +6079,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (reg->type == PTR_TO_MAP_KEY) {
if (t == BPF_WRITE) {
- verbose(env, "write to change key R%d not allowed\n", regno);
+ verbose(env, "write to change key %s not allowed\n",
+ reg_arg_name(env, argno));
return -EACCES;
}
- err = check_mem_region_access(env, regno, off, size,
+ err = check_mem_region_access(env, reg, argno, off, size,
reg->map_ptr->key_size, false);
if (err)
return err;
@@ -6356,17 +6098,17 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
verbose(env, "R%d leaks addr into map\n", value_regno);
return -EACCES;
}
- err = check_map_access_type(env, regno, off, size, t);
+ err = check_map_access_type(env, reg, off, size, t);
if (err)
return err;
- err = check_map_access(env, regno, off, size, false, ACCESS_DIRECT);
+ err = check_map_access(env, reg, argno, off, size, false, ACCESS_DIRECT);
if (err)
return err;
if (tnum_is_const(reg->var_off))
kptr_field = btf_record_find(reg->map_ptr->record,
off + reg->var_off.value, BPF_KPTR | BPF_UPTR);
if (kptr_field) {
- err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field);
+ err = check_map_kptr_access(env, value_regno, insn_idx, kptr_field);
} else if (t == BPF_READ && value_regno >= 0) {
struct bpf_map *map = reg->map_ptr;
@@ -6394,7 +6136,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
size);
return -EACCES;
}
- copy_register_state(&regs[value_regno], reg);
+ regs[value_regno] = *reg;
add_scalar_to_reg(&regs[value_regno], off);
regs[value_regno].type = PTR_TO_INSN;
} else {
@@ -6406,14 +6148,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
bool rdonly_untrusted = rdonly_mem && (reg->type & PTR_UNTRUSTED);
if (type_may_be_null(reg->type)) {
- verbose(env, "R%d invalid mem access '%s'\n", regno,
+ verbose(env, "%s invalid mem access '%s'\n", reg_arg_name(env, argno),
reg_type_str(env, reg->type));
return -EACCES;
}
if (t == BPF_WRITE && rdonly_mem) {
- verbose(env, "R%d cannot write into %s\n",
- regno, reg_type_str(env, reg->type));
+ verbose(env, "%s cannot write into %s\n",
+ reg_arg_name(env, argno), reg_type_str(env, reg->type));
return -EACCES;
}
@@ -6428,7 +6170,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* instructions, hence no need to check bounds in that case.
*/
if (!rdonly_untrusted)
- err = check_mem_region_access(env, regno, off, size,
+ err = check_mem_region_access(env, reg, argno, off, size,
reg->mem_size, false);
if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem))
mark_reg_unknown(env, regs, value_regno);
@@ -6446,7 +6188,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
return -EACCES;
}
- err = check_ctx_access(env, insn_idx, regno, off, size, t, &info);
+ err = check_ctx_access(env, insn_idx, reg, argno, off, size, t, &info);
if (!err && t == BPF_READ && value_regno >= 0) {
/* ctx access returns either a scalar, or a
* PTR_TO_PACKET[_META,_END]. In the latter
@@ -6464,8 +6206,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else {
mark_reg_known_zero(env, regs,
value_regno);
- if (type_may_be_null(info.reg_type))
- regs[value_regno].id = ++env->id_gen;
/* A load of ctx field could have different
* actual load size with the one encoded in the
* insn. When the dst is PTR, it is for sure not
@@ -6475,23 +6215,25 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (base_type(info.reg_type) == PTR_TO_BTF_ID) {
regs[value_regno].btf = info.btf;
regs[value_regno].btf_id = info.btf_id;
- regs[value_regno].ref_obj_id = info.ref_obj_id;
+ regs[value_regno].id = info.ref_id;
}
+ if (type_may_be_null(info.reg_type) && !regs[value_regno].id)
+ regs[value_regno].id = ++env->id_gen;
}
regs[value_regno].type = info.reg_type;
}
} else if (reg->type == PTR_TO_STACK) {
/* Basic bounds checks. */
- err = check_stack_access_within_bounds(env, regno, off, size, t);
+ err = check_stack_access_within_bounds(env, reg, argno, off, size, t);
if (err)
return err;
if (t == BPF_READ)
- err = check_stack_read(env, regno, off, size,
+ err = check_stack_read(env, reg, argno, off, size,
value_regno);
else
- err = check_stack_write(env, regno, off, size,
+ err = check_stack_write(env, reg, off, size,
value_regno, insn_idx);
} else if (reg_is_pkt_pointer(reg)) {
if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
@@ -6504,7 +6246,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
value_regno);
return -EACCES;
}
- err = check_packet_access(env, regno, off, size, false);
+ err = check_packet_access(env, reg, argno, off, size, false);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_FLOW_KEYS) {
@@ -6515,28 +6257,28 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
return -EACCES;
}
- err = check_flow_keys_access(env, off, size);
+ err = check_flow_keys_access(env, reg, argno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
} else if (type_is_sk_pointer(reg->type)) {
if (t == BPF_WRITE) {
- verbose(env, "R%d cannot write into %s\n",
- regno, reg_type_str(env, reg->type));
+ verbose(env, "%s cannot write into %s\n",
+ reg_arg_name(env, argno), reg_type_str(env, reg->type));
return -EACCES;
}
- err = check_sock_access(env, insn_idx, regno, off, size, t);
+ err = check_sock_access(env, insn_idx, reg, argno, off, size, t);
if (!err && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_TP_BUFFER) {
- err = check_tp_buffer_access(env, reg, regno, off, size);
+ err = check_tp_buffer_access(env, reg, argno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
} else if (base_type(reg->type) == PTR_TO_BTF_ID &&
!type_may_be_null(reg->type)) {
- err = check_ptr_to_btf_access(env, regs, regno, off, size, t,
+ err = check_ptr_to_btf_access(env, regs, reg, argno, off, size, t,
value_regno);
} else if (reg->type == CONST_PTR_TO_MAP) {
- err = check_ptr_to_map_access(env, regs, regno, off, size, t,
+ err = check_ptr_to_map_access(env, regs, reg, argno, off, size, t,
value_regno);
} else if (base_type(reg->type) == PTR_TO_BUF &&
!type_may_be_null(reg->type)) {
@@ -6545,8 +6287,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (rdonly_mem) {
if (t == BPF_WRITE) {
- verbose(env, "R%d cannot write into %s\n",
- regno, reg_type_str(env, reg->type));
+ verbose(env, "%s cannot write into %s\n",
+ reg_arg_name(env, argno), reg_type_str(env, reg->type));
return -EACCES;
}
max_access = &env->prog->aux->max_rdonly_access;
@@ -6554,7 +6296,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
max_access = &env->prog->aux->max_rdwr_access;
}
- err = check_buffer_access(env, reg, regno, off, size, false,
+ err = check_buffer_access(env, reg, argno, off, size, false,
max_access);
if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
@@ -6563,7 +6305,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
} else {
- verbose(env, "R%d invalid mem access '%s'\n", regno,
+ verbose(env, "%s invalid mem access '%s'\n", reg_arg_name(env, argno),
reg_type_str(env, reg->type));
return -EACCES;
}
@@ -6586,10 +6328,20 @@ static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn,
bool strict_alignment_once, bool is_ldsx,
bool allow_trust_mismatch, const char *ctx)
{
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *regs = cur_regs(env);
enum bpf_reg_type src_reg_type;
int err;
+ /* Handle stack arg read */
+ if (is_stack_arg_ldx(insn)) {
+ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
+ if (err)
+ return err;
+ return check_stack_arg_read(env, state, insn->off, insn->dst_reg);
+ }
+
/* check src operand */
err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
@@ -6605,7 +6357,7 @@ static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn,
/* Check if (src_reg + off) is readable. The state of dst_reg will be
* updated by this call.
*/
- err = check_mem_access(env, env->insn_idx, insn->src_reg, insn->off,
+ err = check_mem_access(env, env->insn_idx, regs + insn->src_reg, argno_from_reg(insn->src_reg), insn->off,
BPF_SIZE(insn->code), BPF_READ, insn->dst_reg,
strict_alignment_once, is_ldsx);
err = err ?: save_aux_ptr_type(env, src_reg_type,
@@ -6618,10 +6370,20 @@ static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn,
static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn,
bool strict_alignment_once)
{
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *regs = cur_regs(env);
enum bpf_reg_type dst_reg_type;
int err;
+ /* Handle stack arg write */
+ if (is_stack_arg_stx(insn)) {
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ return check_stack_arg_write(env, state, insn->off, regs + insn->src_reg);
+ }
+
/* check src1 operand */
err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
@@ -6635,7 +6397,7 @@ static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn,
dst_reg_type = regs[insn->dst_reg].type;
/* Check if (dst_reg + off) is writeable. */
- err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off,
+ err = check_mem_access(env, env->insn_idx, regs + insn->dst_reg, argno_from_reg(insn->dst_reg), insn->off,
BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg,
strict_alignment_once, false);
err = err ?: save_aux_ptr_type(env, dst_reg_type, false);
@@ -6646,6 +6408,7 @@ static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn,
static int check_atomic_rmw(struct bpf_verifier_env *env,
struct bpf_insn *insn)
{
+ struct bpf_reg_state *dst_reg;
int load_reg;
int err;
@@ -6707,13 +6470,15 @@ static int check_atomic_rmw(struct bpf_verifier_env *env,
load_reg = -1;
}
+ dst_reg = cur_regs(env) + insn->dst_reg;
+
/* Check whether we can read the memory, with second call for fetch
* case to simulate the register fill.
*/
- err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off,
+ err = check_mem_access(env, env->insn_idx, dst_reg, argno_from_reg(insn->dst_reg), insn->off,
BPF_SIZE(insn->code), BPF_READ, -1, true, false);
if (!err && load_reg >= 0)
- err = check_mem_access(env, env->insn_idx, insn->dst_reg,
+ err = check_mem_access(env, env->insn_idx, dst_reg, argno_from_reg(insn->dst_reg),
insn->off, BPF_SIZE(insn->code),
BPF_READ, load_reg, true, false);
if (err)
@@ -6725,7 +6490,7 @@ static int check_atomic_rmw(struct bpf_verifier_env *env,
return err;
}
/* Check whether we can write into the same memory. */
- err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off,
+ err = check_mem_access(env, env->insn_idx, dst_reg, argno_from_reg(insn->dst_reg), insn->off,
BPF_SIZE(insn->code), BPF_WRITE, -1, true, false);
if (err)
return err;
@@ -6814,11 +6579,10 @@ static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn)
* read offsets are marked as read.
*/
static int check_stack_range_initialized(
- struct bpf_verifier_env *env, int regno, int off,
+ struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off,
int access_size, bool zero_size_allowed,
enum bpf_access_type type, struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_func_state *state = bpf_func(env, reg);
int err, min_off, max_off, i, j, slot, spi;
/* Some accesses can write anything into the stack, others are
@@ -6840,7 +6604,7 @@ static int check_stack_range_initialized(
return -EACCES;
}
- err = check_stack_access_within_bounds(env, regno, off, access_size, type);
+ err = check_stack_access_within_bounds(env, reg, argno, off, access_size, type);
if (err)
return err;
@@ -6857,8 +6621,8 @@ static int check_stack_range_initialized(
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n",
- regno, tn_buf);
+ verbose(env, "%s variable offset stack access prohibited for !root, var_off=%s\n",
+ reg_arg_name(env, argno), tn_buf);
return -EACCES;
}
/* Only initialized buffer on stack is allowed to be accessed
@@ -6870,8 +6634,8 @@ static int check_stack_range_initialized(
if (meta && meta->raw_mode)
meta = NULL;
- min_off = reg->smin_value + off;
- max_off = reg->smax_value + off;
+ min_off = reg_smin(reg) + off;
+ max_off = reg_smax(reg) + off;
}
if (meta && meta->raw_mode) {
@@ -6901,7 +6665,7 @@ static int check_stack_range_initialized(
}
}
meta->access_size = access_size;
- meta->regno = regno;
+ meta->regno = reg_from_argno(argno);
return 0;
}
@@ -6941,17 +6705,17 @@ static int check_stack_range_initialized(
if (*stype == STACK_POISON) {
if (allow_poison)
goto mark;
- verbose(env, "reading from stack R%d off %d+%d size %d, slot poisoned by dead code elimination\n",
- regno, min_off, i - min_off, access_size);
+ verbose(env, "reading from stack %s off %d+%d size %d, slot poisoned by dead code elimination\n",
+ reg_arg_name(env, argno), min_off, i - min_off, access_size);
} else if (tnum_is_const(reg->var_off)) {
- verbose(env, "invalid read from stack R%d off %d+%d size %d\n",
- regno, min_off, i - min_off, access_size);
+ verbose(env, "invalid read from stack %s off %d+%d size %d\n",
+ reg_arg_name(env, argno), min_off, i - min_off, access_size);
} else {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "invalid read from stack R%d var_off %s+%d size %d\n",
- regno, tn_buf, i - min_off, access_size);
+ verbose(env, "invalid read from stack %s var_off %s+%d size %d\n",
+ reg_arg_name(env, argno), tn_buf, i - min_off, access_size);
}
return -EACCES;
mark:
@@ -6960,48 +6724,48 @@ mark:
return 0;
}
-static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
+static int check_helper_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno,
int access_size, enum bpf_access_type access_type,
bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_reg_state *regs = cur_regs(env);
u32 *max_access;
switch (base_type(reg->type)) {
case PTR_TO_PACKET:
case PTR_TO_PACKET_META:
- return check_packet_access(env, regno, 0, access_size,
+ return check_packet_access(env, reg, argno, 0, access_size,
zero_size_allowed);
case PTR_TO_MAP_KEY:
if (access_type == BPF_WRITE) {
- verbose(env, "R%d cannot write into %s\n", regno,
- reg_type_str(env, reg->type));
+ verbose(env, "%s cannot write into %s\n",
+ reg_arg_name(env, argno), reg_type_str(env, reg->type));
return -EACCES;
}
- return check_mem_region_access(env, regno, 0, access_size,
+ return check_mem_region_access(env, reg, argno, 0, access_size,
reg->map_ptr->key_size, false);
case PTR_TO_MAP_VALUE:
- if (check_map_access_type(env, regno, 0, access_size, access_type))
+ if (check_map_access_type(env, reg, 0, access_size, access_type))
return -EACCES;
- return check_map_access(env, regno, 0, access_size,
+ return check_map_access(env, reg, argno, 0, access_size,
zero_size_allowed, ACCESS_HELPER);
case PTR_TO_MEM:
if (type_is_rdonly_mem(reg->type)) {
if (access_type == BPF_WRITE) {
- verbose(env, "R%d cannot write into %s\n", regno,
- reg_type_str(env, reg->type));
+ verbose(env, "%s cannot write into %s\n",
+ reg_arg_name(env, argno), reg_type_str(env, reg->type));
return -EACCES;
}
}
- return check_mem_region_access(env, regno, 0,
+ return check_mem_region_access(env, reg, argno, 0,
access_size, reg->mem_size,
zero_size_allowed);
case PTR_TO_BUF:
if (type_is_rdonly_mem(reg->type)) {
if (access_type == BPF_WRITE) {
- verbose(env, "R%d cannot write into %s\n", regno,
- reg_type_str(env, reg->type));
+ verbose(env, "%s cannot write into %s\n",
+ reg_arg_name(env, argno), reg_type_str(env, reg->type));
return -EACCES;
}
@@ -7009,26 +6773,26 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
} else {
max_access = &env->prog->aux->max_rdwr_access;
}
- return check_buffer_access(env, reg, regno, 0,
+ return check_buffer_access(env, reg, argno, 0,
access_size, zero_size_allowed,
max_access);
case PTR_TO_STACK:
return check_stack_range_initialized(
- env,
- regno, 0, access_size,
+ env, reg,
+ argno, 0, access_size,
zero_size_allowed, access_type, meta);
case PTR_TO_BTF_ID:
- return check_ptr_to_btf_access(env, regs, regno, 0,
- access_size, BPF_READ, -1);
+ return check_ptr_to_btf_access(env, regs, reg, argno, 0,
+ access_size, access_type, -1);
case PTR_TO_CTX:
/* Only permit reading or writing syscall context using helper calls. */
if (is_var_ctx_off_allowed(env->prog)) {
- int err = check_mem_region_access(env, regno, 0, access_size, U16_MAX,
+ int err = check_mem_region_access(env, reg, argno, 0, access_size, U16_MAX,
zero_size_allowed);
if (err)
return err;
- if (env->prog->aux->max_ctx_offset < reg->umax_value + access_size)
- env->prog->aux->max_ctx_offset = reg->umax_value + access_size;
+ if (env->prog->aux->max_ctx_offset < reg_umax(reg) + access_size)
+ env->prog->aux->max_ctx_offset = reg_umax(reg) + access_size;
return 0;
}
fallthrough;
@@ -7038,7 +6802,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
bpf_register_is_null(reg))
return 0;
- verbose(env, "R%d type=%s ", regno,
+ verbose(env, "%s type=%s ", reg_arg_name(env, argno),
reg_type_str(env, reg->type));
verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK));
return -EACCES;
@@ -7048,12 +6812,12 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
/* verify arguments to helpers or kfuncs consisting of a pointer and an access
* size.
*
- * @regno is the register containing the access size. regno-1 is the register
- * containing the pointer.
+ * @mem_reg contains the pointer, @size_reg contains the access size.
*/
static int check_mem_size_reg(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg, u32 regno,
- enum bpf_access_type access_type,
+ struct bpf_reg_state *mem_reg,
+ struct bpf_reg_state *size_reg, argno_t mem_argno,
+ argno_t size_argno, enum bpf_access_type access_type,
bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
@@ -7067,42 +6831,48 @@ static int check_mem_size_reg(struct bpf_verifier_env *env,
* out. Only upper bounds can be learned because retval is an
* int type and negative retvals are allowed.
*/
- meta->msize_max_value = reg->umax_value;
+ meta->msize_max_value = reg_umax(size_reg);
/* The register is SCALAR_VALUE; the access check happens using
* its boundaries. For unprivileged variable accesses, disable
* raw mode so that the program is required to initialize all
* the memory that the helper could just partially fill up.
*/
- if (!tnum_is_const(reg->var_off))
+ if (!tnum_is_const(size_reg->var_off))
meta = NULL;
- if (reg->smin_value < 0) {
- verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
- regno);
+ if (reg_smin(size_reg) < 0) {
+ verbose(env, "%s min value is negative, either use unsigned or 'var &= const'\n",
+ reg_arg_name(env, size_argno));
return -EACCES;
}
- if (reg->umin_value == 0 && !zero_size_allowed) {
- verbose(env, "R%d invalid zero-sized read: u64=[%lld,%lld]\n",
- regno, reg->umin_value, reg->umax_value);
+ if (reg_umin(size_reg) == 0 && !zero_size_allowed) {
+ verbose(env, "%s invalid zero-sized read: u64=[%lld,%lld]\n",
+ reg_arg_name(env, size_argno), reg_umin(size_reg), reg_umax(size_reg));
return -EACCES;
}
- if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
- verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
- regno);
+ if (reg_umax(size_reg) >= BPF_MAX_VAR_SIZ) {
+ verbose(env, "%s unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+ reg_arg_name(env, size_argno));
return -EACCES;
}
- err = check_helper_mem_access(env, regno - 1, reg->umax_value,
+ err = check_helper_mem_access(env, mem_reg, mem_argno, reg_umax(size_reg),
access_type, zero_size_allowed, meta);
- if (!err)
- err = mark_chain_precision(env, regno);
+ if (!err) {
+ int regno = reg_from_argno(size_argno);
+
+ if (regno >= 0)
+ err = mark_chain_precision(env, regno);
+ else
+ err = mark_stack_arg_precision(env, arg_idx_from_argno(size_argno));
+ }
return err;
}
static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
- u32 regno, u32 mem_size)
+ argno_t argno, u32 mem_size)
{
bool may_be_null = type_may_be_null(reg->type);
struct bpf_reg_state saved_reg;
@@ -7111,6 +6881,12 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg
if (bpf_register_is_null(reg))
return 0;
+ if (mem_size > S32_MAX) {
+ verbose(env, "%s memory size %u is too large\n",
+ reg_arg_name(env, argno), mem_size);
+ return -EACCES;
+ }
+
/* Assuming that the register contains a value check if the memory
* access is safe. Temporarily save and restore the register's state as
* the conversion shouldn't be visible to a caller.
@@ -7122,8 +6898,8 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg
int size = base_type(reg->type) == PTR_TO_STACK ? -(int)mem_size : mem_size;
- err = check_helper_mem_access(env, regno, size, BPF_READ, true, NULL);
- err = err ?: check_helper_mem_access(env, regno, size, BPF_WRITE, true, NULL);
+ err = check_helper_mem_access(env, reg, argno, size, BPF_READ, true, NULL);
+ err = err ?: check_helper_mem_access(env, reg, argno, size, BPF_WRITE, true, NULL);
if (may_be_null)
*reg = saved_reg;
@@ -7131,17 +6907,14 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg
return err;
}
-static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
- u32 regno)
+static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *mem_reg,
+ struct bpf_reg_state *size_reg, argno_t mem_argno, argno_t size_argno)
{
- struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1];
bool may_be_null = type_may_be_null(mem_reg->type);
struct bpf_reg_state saved_reg;
struct bpf_call_arg_meta meta;
int err;
- WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5);
-
memset(&meta, 0, sizeof(meta));
if (may_be_null) {
@@ -7149,8 +6922,8 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg
mark_ptr_not_null_reg(mem_reg);
}
- err = check_mem_size_reg(env, reg, regno, BPF_READ, true, &meta);
- err = err ?: check_mem_size_reg(env, reg, regno, BPF_WRITE, true, &meta);
+ err = check_mem_size_reg(env, mem_reg, size_reg, mem_argno, size_argno, BPF_READ, true, &meta);
+ err = err ?: check_mem_size_reg(env, mem_reg, size_reg, mem_argno, size_argno, BPF_WRITE, true, &meta);
if (may_be_null)
*mem_reg = saved_reg;
@@ -7186,11 +6959,10 @@ enum {
* env->cur_state->active_locks remembers which map value element or allocated
* object got locked and clears it after bpf_spin_unlock.
*/
-static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags)
+static int process_spin_lock(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int flags)
{
bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK;
const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin";
- struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_verifier_state *cur = env->cur_state;
bool is_const = tnum_is_const(reg->var_off);
bool is_irq = flags & PROCESS_LOCK_IRQ;
@@ -7203,8 +6975,8 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags)
if (!is_const) {
verbose(env,
- "R%d doesn't have constant offset. %s_lock has to be at the constant offset\n",
- regno, lock_str);
+ "%s doesn't have constant offset. %s_lock has to be at the constant offset\n",
+ reg_arg_name(env, argno), lock_str);
return -EINVAL;
}
if (reg->type == PTR_TO_MAP_VALUE) {
@@ -7303,11 +7075,10 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags)
}
/* Check if @regno is a pointer to a specific field in a map value */
-static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno,
+static int check_map_field_pointer(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno,
enum btf_field_type field_type,
struct bpf_map_desc *map_desc)
{
- struct bpf_reg_state *reg = reg_state(env, regno);
bool is_const = tnum_is_const(reg->var_off);
struct bpf_map *map = reg->map_ptr;
u64 val = reg->var_off.value;
@@ -7316,8 +7087,8 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno,
if (!is_const) {
verbose(env,
- "R%d doesn't have constant offset. %s has to be at the constant offset\n",
- regno, struct_name);
+ "%s doesn't have constant offset. %s has to be at the constant offset\n",
+ reg_arg_name(env, argno), struct_name);
return -EINVAL;
}
if (!map->btf) {
@@ -7357,26 +7128,26 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno,
return 0;
}
-static int process_timer_func(struct bpf_verifier_env *env, int regno,
+static int process_timer_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno,
struct bpf_map_desc *map)
{
if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n");
return -EOPNOTSUPP;
}
- return check_map_field_pointer(env, regno, BPF_TIMER, map);
+ return check_map_field_pointer(env, reg, argno, BPF_TIMER, map);
}
-static int process_timer_helper(struct bpf_verifier_env *env, int regno,
+static int process_timer_helper(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno,
struct bpf_call_arg_meta *meta)
{
- return process_timer_func(env, regno, &meta->map);
+ return process_timer_func(env, reg, argno, &meta->map);
}
-static int process_timer_kfunc(struct bpf_verifier_env *env, int regno,
+static int process_timer_kfunc(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno,
struct bpf_kfunc_call_arg_meta *meta)
{
- return process_timer_func(env, regno, &meta->map);
+ return process_timer_func(env, reg, argno, &meta->map);
}
static int process_kptr_func(struct bpf_verifier_env *env, int regno,
@@ -7427,52 +7198,42 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
return 0;
}
-/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK
+/*
+ * Validate dynptr arguments for helper, kfunc and subprog.
+ *
+ * @dynptr is both input and output. It is populated when the argument is
+ * tagged with MEM_UNINIT (i.e., the dynptr argument that will be constructed)
+ * and consumed when the argument is expected to be an initialized dynptr.
+ * @parent_id is used to track the referenced parent object (e.g., file or skb in
+ * qdisc program) when constructing a dynptr.
+ *
+ * There are two register types representing a bpf_dynptr, one is PTR_TO_STACK
* which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR.
*
* In both cases we deal with the first 8 bytes, but need to mark the next 8
* bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of
* CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object.
*
- * Mutability of bpf_dynptr is at two levels, one is at the level of struct
- * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct
- * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can
- * mutate the view of the dynptr and also possibly destroy it. In the latter
- * case, it cannot mutate the bpf_dynptr itself but it can still mutate the
- * memory that dynptr points to.
- *
- * The verifier will keep track both levels of mutation (bpf_dynptr's in
- * reg->type and the memory's in reg->dynptr.type), but there is no support for
- * readonly dynptr view yet, hence only the first case is tracked and checked.
- *
- * This is consistent with how C applies the const modifier to a struct object,
- * where the pointer itself inside bpf_dynptr becomes const but not what it
- * points to.
- *
- * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument
- * type, and declare it as 'const struct bpf_dynptr *' in their prototype.
+ * Mutability of bpf_dynptr is at two levels: the dynptr and the memory the
+ * dynptr points to. At the first level, the verifier will make sure a
+ * CONST_PTR_TO_DYNPTR cannot be reinitialized or destroyed. The mutability of
+ * a dynptr's view (i.e., start and offset) is not tracked as there is no such
+ * use case. The second level is tracked using the upper bit of bpf_dynptr->size
+ * and checked dynamically during runtime.
*/
-static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx,
- enum bpf_arg_type arg_type, int clone_ref_obj_id)
+static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ argno_t argno, int insn_idx, enum bpf_arg_type arg_type,
+ struct ref_obj_desc *ref_obj, struct bpf_dynptr_desc *dynptr)
{
- struct bpf_reg_state *reg = reg_state(env, regno);
- int err;
+ int spi, err = 0;
if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) {
verbose(env,
- "arg#%d expected pointer to stack or const struct bpf_dynptr\n",
- regno - 1);
+ "%s expected pointer to stack or const struct bpf_dynptr\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
- /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
- * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
- */
- if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) {
- verifier_bug(env, "misconfigured dynptr helper type flags");
- return -EFAULT;
- }
-
/* MEM_UNINIT - Points to memory that is an appropriate candidate for
* constructing a mutable bpf_dynptr object.
*
@@ -7480,13 +7241,12 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
* pointing to a region of at least 16 bytes which doesn't
* contain an existing bpf_dynptr.
*
- * MEM_RDONLY - Points to a initialized bpf_dynptr that will not be
- * mutated or destroyed. However, the memory it points to
- * may be mutated.
+ * OBJ_RELEASE - Points to an initialized bpf_dynptr that will be
+ * destroyed.
*
- * None - Points to a initialized dynptr that can be mutated and
- * destroyed, including mutation of the memory it points
- * to.
+ * None - Points to an initialized dynptr that cannot be
+ * reinitialized or destroyed. However, the view of the
+ * dynptr and the memory it points to may be mutated.
*/
if (arg_type & MEM_UNINIT) {
int i;
@@ -7498,45 +7258,58 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
/* we write BPF_DW bits (8 bytes) at a time */
for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
- err = check_mem_access(env, insn_idx, regno,
+ err = check_mem_access(env, insn_idx, reg, argno,
i, BPF_DW, BPF_WRITE, -1, false, false);
if (err)
return err;
}
- err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, clone_ref_obj_id);
- } else /* MEM_RDONLY and None case from above */ {
+ err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, ref_obj, dynptr);
+ } else /* OBJ_RELEASE and None case from above */ {
/* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */
- if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {
- verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n");
+ if (reg->type == CONST_PTR_TO_DYNPTR && (arg_type & OBJ_RELEASE)) {
+ verbose(env, "CONST_PTR_TO_DYNPTR cannot be released\n");
return -EINVAL;
}
if (!is_dynptr_reg_valid_init(env, reg)) {
- verbose(env,
- "Expected an initialized dynptr as arg #%d\n",
- regno - 1);
+ verbose(env, "Expected an initialized dynptr as %s\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
- /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */
- if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) {
+ /* Fold modifiers (in this case, OBJ_RELEASE) when checking expected type */
+ if (!is_dynptr_type_expected(env, reg, arg_type & ~OBJ_RELEASE)) {
verbose(env,
- "Expected a dynptr of type %s as arg #%d\n",
- dynptr_type_str(arg_to_dynptr_type(arg_type)), regno - 1);
+ "Expected a dynptr of type %s as %s\n",
+ dynptr_type_str(arg_to_dynptr_type(arg_type)),
+ reg_arg_name(env, argno));
return -EINVAL;
}
- err = mark_dynptr_read(env, reg);
- }
- return err;
-}
+ if (reg->type != CONST_PTR_TO_DYNPTR) {
+ struct bpf_func_state *state = bpf_func(env, reg);
-static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi)
-{
- struct bpf_func_state *state = bpf_func(env, reg);
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
- return state->stack[spi].spilled_ptr.ref_obj_id;
+ /*
+ * For CONST_PTR_TO_DYNPTR, reg is already scratched by check_reg_arg
+ * in check_helper_call and mark_btf_func_reg_size in check_kfunc_call.
+ */
+ mark_stack_slots_scratched(env, spi, BPF_DYNPTR_NR_SLOTS);
+
+ reg = &state->stack[spi].spilled_ptr;
+ }
+
+ if (dynptr) {
+ dynptr->type = reg->dynptr.type;
+ dynptr->id = reg->id;
+ dynptr->parent_id = reg->parent_id;
+ }
+ }
+ return err;
}
static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta)
@@ -7568,15 +7341,17 @@ static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx,
return btf_param_match_suffix(meta->btf, arg, "__iter");
}
-static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx,
+static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int insn_idx,
struct bpf_kfunc_call_arg_meta *meta)
{
- struct bpf_reg_state *reg = reg_state(env, regno);
+ struct bpf_func_state *state = bpf_func(env, reg);
const struct btf_type *t;
+ u32 arg_idx = arg_idx_from_argno(argno);
int spi, err, i, nr_slots, btf_id;
if (reg->type != PTR_TO_STACK) {
- verbose(env, "arg#%d expected pointer to an iterator on stack\n", regno - 1);
+ verbose(env, "%s expected pointer to an iterator on stack\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
@@ -7586,9 +7361,10 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
* to any kfunc, if arg has "__iter" suffix, we need to be a bit more
* conservative here.
*/
- btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, regno - 1);
+ btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, arg_idx);
if (btf_id < 0) {
- verbose(env, "expected valid iter pointer as arg #%d\n", regno - 1);
+ verbose(env, "expected valid iter pointer as %s\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
t = btf_type_by_id(meta->btf, btf_id);
@@ -7597,13 +7373,13 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
if (is_iter_new_kfunc(meta)) {
/* bpf_iter_<type>_new() expects pointer to uninit iter state */
if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
- verbose(env, "expected uninitialized iter_%s as arg #%d\n",
- iter_type_str(meta->btf, btf_id), regno - 1);
+ verbose(env, "expected uninitialized iter_%s as %s\n",
+ iter_type_str(meta->btf, btf_id), reg_arg_name(env, argno));
return -EINVAL;
}
for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) {
- err = check_mem_access(env, insn_idx, regno,
+ err = check_mem_access(env, insn_idx, reg, argno,
i, BPF_DW, BPF_WRITE, -1, false, false);
if (err)
return err;
@@ -7621,8 +7397,8 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
case 0:
break;
case -EINVAL:
- verbose(env, "expected an initialized iter_%s as arg #%d\n",
- iter_type_str(meta->btf, btf_id), regno - 1);
+ verbose(env, "expected an initialized iter_%s as %s\n",
+ iter_type_str(meta->btf, btf_id), reg_arg_name(env, argno));
return err;
case -EPROTO:
verbose(env, "expected an RCU CS when using %s\n", meta->func_name);
@@ -7635,14 +7411,12 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
if (spi < 0)
return spi;
- err = mark_iter_read(env, reg, spi, nr_slots);
- if (err)
- return err;
+ mark_stack_slots_scratched(env, spi, nr_slots);
/* remember meta->iter info for process_iter_next_call() */
meta->iter.spi = spi;
meta->iter.frameno = reg->frameno;
- meta->ref_obj_id = iter_ref_obj_id(env, reg, spi);
+ update_ref_obj(&meta->ref_obj, &state->stack[spi].spilled_ptr);
if (is_iter_destroy_kfunc(meta)) {
err = unmark_stack_slots_iter(env, reg, nr_slots);
@@ -8042,12 +7816,11 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_DYNPTR] = &dynptr_types,
};
-static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
+static int check_reg_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno,
enum bpf_arg_type arg_type,
const u32 *arg_btf_id,
struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *reg = reg_state(env, regno);
enum bpf_reg_type expected, type = reg->type;
const struct bpf_reg_types *compatible;
int i, j, err;
@@ -8078,7 +7851,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
type &= ~DYNPTR_TYPE_FLAG_MASK;
/* Local kptr types are allowed as the source argument of bpf_kptr_xchg */
- if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type) && regno == BPF_REG_2) {
+ if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type) && reg_from_argno(argno) == BPF_REG_2) {
type &= ~MEM_ALLOC;
type &= ~MEM_PERCPU;
}
@@ -8092,7 +7865,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
goto found;
}
- verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type));
+ verbose(env, "%s type=%s expected=", reg_arg_name(env, argno), reg_type_str(env, reg->type));
for (j = 0; j + 1 < i; j++)
verbose(env, "%s, ", reg_type_str(env, compatible->types[j]));
verbose(env, "%s\n", reg_type_str(env, compatible->types[j]));
@@ -8105,9 +7878,9 @@ found:
if (compatible == &mem_types) {
if (!(arg_type & MEM_RDONLY)) {
verbose(env,
- "%s() may write into memory pointed by R%d type=%s\n",
+ "%s() may write into memory pointed by %s type=%s\n",
func_id_name(meta->func_id),
- regno, reg_type_str(env, reg->type));
+ reg_arg_name(env, argno), reg_type_str(env, reg->type));
return -EACCES;
}
return 0;
@@ -8130,7 +7903,8 @@ found:
if (type_may_be_null(reg->type) &&
(!type_may_be_null(arg_type) || arg_type_is_release(arg_type))) {
- verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno);
+ verbose(env, "Possibly NULL pointer passed to helper %s\n",
+ reg_arg_name(env, argno));
return -EACCES;
}
@@ -8143,25 +7917,26 @@ found:
}
if (meta->func_id == BPF_FUNC_kptr_xchg) {
- if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
+ if (map_kptr_match_type(env, meta->kptr_field, reg, reg_from_argno(argno)))
return -EACCES;
} else {
if (arg_btf_id == BPF_PTR_POISON) {
verbose(env, "verifier internal error:");
- verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n",
- regno);
+ verbose(env, "%s has non-overwritten BPF_PTR_POISON type\n",
+ reg_arg_name(env, argno));
return -EACCES;
}
- err = __check_ptr_off_reg(env, reg, regno, true);
+ err = __check_ptr_off_reg(env, reg, argno, true);
if (err)
return err;
if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id,
reg->var_off.value, btf_vmlinux, *arg_btf_id,
strict_type_match)) {
- verbose(env, "R%d is of type %s but %s is expected\n",
- regno, btf_type_name(reg->btf, reg->btf_id),
+ verbose(env, "%s is of type %s but %s is expected\n",
+ reg_arg_name(env, argno),
+ btf_type_name(reg->btf, reg->btf_id),
btf_type_name(btf_vmlinux, *arg_btf_id));
return -EACCES;
}
@@ -8178,8 +7953,11 @@ found:
return -EFAULT;
}
/* Check if local kptr in src arg matches kptr in dst arg */
- if (meta->func_id == BPF_FUNC_kptr_xchg && regno == BPF_REG_2) {
- if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
+ if (meta->func_id == BPF_FUNC_kptr_xchg) {
+ int regno = reg_from_argno(argno);
+
+ if (regno == BPF_REG_2 &&
+ map_kptr_match_type(env, meta->kptr_field, reg, regno))
return -EACCES;
}
break;
@@ -8213,7 +7991,7 @@ reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields)
}
static int check_func_arg_reg_off(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg, int regno,
+ const struct bpf_reg_state *reg, argno_t argno,
enum bpf_arg_type arg_type)
{
u32 type = reg->type;
@@ -8221,7 +7999,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env,
/* When referenced register is passed to release function, its fixed
* offset must be 0.
*
- * We will check arg_type_is_release reg has ref_obj_id when storing
+ * We will check arg_type_is_release reg has id when storing
* meta->release_regno.
*/
if (arg_type_is_release(arg_type)) {
@@ -8239,8 +8017,8 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env,
* to give the user a better error message.
*/
if (!tnum_is_const(reg->var_off) || reg->var_off.value != 0) {
- verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n",
- regno);
+ verbose(env, "%s must have zero offset when passed to release func or trusted arg to kfunc\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
}
@@ -8276,7 +8054,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env,
* cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we
* still need to do checks instead of returning.
*/
- return __check_ptr_off_reg(env, reg, regno, true);
+ return __check_ptr_off_reg(env, reg, argno, true);
case PTR_TO_CTX:
/*
* Allow fixed and variable offsets for syscall context, but
@@ -8288,78 +8066,12 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env,
return 0;
fallthrough;
default:
- return __check_ptr_off_reg(env, reg, regno, false);
+ return __check_ptr_off_reg(env, reg, argno, false);
}
}
-static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env,
- const struct bpf_func_proto *fn,
- struct bpf_reg_state *regs)
-{
- struct bpf_reg_state *state = NULL;
- int i;
-
- for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
- if (arg_type_is_dynptr(fn->arg_type[i])) {
- if (state) {
- verbose(env, "verifier internal error: multiple dynptr args\n");
- return NULL;
- }
- state = &regs[BPF_REG_1 + i];
- }
-
- if (!state)
- verbose(env, "verifier internal error: no dynptr arg found\n");
-
- return state;
-}
-
-static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
-{
- struct bpf_func_state *state = bpf_func(env, reg);
- int spi;
-
- if (reg->type == CONST_PTR_TO_DYNPTR)
- return reg->id;
- spi = dynptr_get_spi(env, reg);
- if (spi < 0)
- return spi;
- return state->stack[spi].spilled_ptr.id;
-}
-
-static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
-{
- struct bpf_func_state *state = bpf_func(env, reg);
- int spi;
-
- if (reg->type == CONST_PTR_TO_DYNPTR)
- return reg->ref_obj_id;
- spi = dynptr_get_spi(env, reg);
- if (spi < 0)
- return spi;
- return state->stack[spi].spilled_ptr.ref_obj_id;
-}
-
-static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg)
-{
- struct bpf_func_state *state = bpf_func(env, reg);
- int spi;
-
- if (reg->type == CONST_PTR_TO_DYNPTR)
- return reg->dynptr.type;
-
- spi = bpf_get_spi(reg->var_off.value);
- if (spi < 0) {
- verbose(env, "verifier internal error: invalid spi when querying dynptr type\n");
- return BPF_DYNPTR_TYPE_INVALID;
- }
-
- return state->stack[spi].spilled_ptr.dynptr.type;
-}
-
-static int check_reg_const_str(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg, u32 regno)
+static int check_arg_const_str(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, argno_t argno)
{
struct bpf_map *map = reg->map_ptr;
int err;
@@ -8371,17 +8083,18 @@ static int check_reg_const_str(struct bpf_verifier_env *env,
return -EINVAL;
if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) {
- verbose(env, "R%d points to insn_array map which cannot be used as const string\n", regno);
+ verbose(env, "%s points to insn_array map which cannot be used as const string\n",
+ reg_arg_name(env, argno));
return -EACCES;
}
if (!bpf_map_is_rdonly(map)) {
- verbose(env, "R%d does not point to a readonly map'\n", regno);
+ verbose(env, "%s does not point to a readonly map'\n", reg_arg_name(env, argno));
return -EACCES;
}
if (!tnum_is_const(reg->var_off)) {
- verbose(env, "R%d is not a constant address'\n", regno);
+ verbose(env, "%s is not a constant address'\n", reg_arg_name(env, argno));
return -EACCES;
}
@@ -8390,7 +8103,7 @@ static int check_reg_const_str(struct bpf_verifier_env *env,
return -EACCES;
}
- err = check_map_access(env, regno, 0,
+ err = check_map_access(env, reg, argno, 0,
map->value_size - reg->var_off.value, false,
ACCESS_HELPER);
if (err)
@@ -8472,7 +8185,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env,
return 0;
}
-static bool can_elide_value_nullness(enum bpf_map_type type);
+static bool can_elide_value_nullness(const struct bpf_map *map);
static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
struct bpf_call_arg_meta *meta,
@@ -8482,6 +8195,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
u32 regno = BPF_REG_1 + arg;
struct bpf_reg_state *reg = reg_state(env, regno);
enum bpf_arg_type arg_type = fn->arg_type[arg];
+ argno_t argno = argno_from_arg(arg + 1);
enum bpf_reg_type type = reg->type;
u32 *arg_btf_id = NULL;
u32 key_size;
@@ -8526,56 +8240,24 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK)
arg_btf_id = fn->arg_btf_id[arg];
- err = check_reg_type(env, regno, arg_type, arg_btf_id, meta);
+ err = check_reg_type(env, reg, argno_from_reg(regno), arg_type, arg_btf_id, meta);
if (err)
return err;
- err = check_func_arg_reg_off(env, reg, regno, arg_type);
+ err = check_func_arg_reg_off(env, reg, argno_from_reg(regno), arg_type);
if (err)
return err;
skip_type_check:
- if (arg_type_is_release(arg_type)) {
- if (arg_type_is_dynptr(arg_type)) {
- struct bpf_func_state *state = bpf_func(env, reg);
- int spi;
-
- /* Only dynptr created on stack can be released, thus
- * the get_spi and stack state checks for spilled_ptr
- * should only be done before process_dynptr_func for
- * PTR_TO_STACK.
- */
- if (reg->type == PTR_TO_STACK) {
- spi = dynptr_get_spi(env, reg);
- if (spi < 0 || !state->stack[spi].spilled_ptr.ref_obj_id) {
- verbose(env, "arg %d is an unacquired reference\n", regno);
- return -EINVAL;
- }
- } else {
- verbose(env, "cannot release unowned const bpf_dynptr\n");
- return -EINVAL;
- }
- } else if (!reg->ref_obj_id && !bpf_register_is_null(reg)) {
- verbose(env, "R%d must be referenced when passed to release function\n",
- regno);
- return -EINVAL;
- }
- if (meta->release_regno) {
- verifier_bug(env, "more than one release argument");
- return -EFAULT;
- }
- meta->release_regno = regno;
+ if (arg_type_is_release(arg_type) && !arg_type_is_dynptr(arg_type) &&
+ !reg_is_referenced(env, reg) && !bpf_register_is_null(reg)) {
+ verbose(env, "release helper %s expects referenced PTR_TO_BTF_ID passed to %s\n",
+ func_id_name(meta->func_id), reg_arg_name(env, argno));
+ return -EINVAL;
}
- if (reg->ref_obj_id && base_type(arg_type) != ARG_KPTR_XCHG_DEST) {
- if (meta->ref_obj_id) {
- verbose(env, "more than one arg with ref_obj_id R%d %u %u",
- regno, reg->ref_obj_id,
- meta->ref_obj_id);
- return -EACCES;
- }
- meta->ref_obj_id = reg->ref_obj_id;
- }
+ if (reg_is_referenced(env, reg))
+ update_ref_obj(&meta->ref_obj, reg);
switch (base_type(arg_type)) {
case ARG_CONST_MAP_PTR:
@@ -8619,10 +8301,10 @@ skip_type_check:
return -EFAULT;
}
key_size = meta->map.ptr->key_size;
- err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL);
+ err = check_helper_mem_access(env, reg, argno_from_reg(regno), key_size, BPF_READ, false, NULL);
if (err)
return err;
- if (can_elide_value_nullness(meta->map.ptr->map_type)) {
+ if (can_elide_value_nullness(meta->map.ptr)) {
err = get_constant_map_key(env, reg, key_size, &meta->const_map_key);
if (err < 0) {
meta->const_map_key = -1;
@@ -8646,7 +8328,7 @@ skip_type_check:
return -EFAULT;
}
meta->raw_mode = arg_type & MEM_UNINIT;
- err = check_helper_mem_access(env, regno, meta->map.ptr->value_size,
+ err = check_helper_mem_access(env, reg, argno_from_reg(regno), meta->map.ptr->value_size,
arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ,
false, meta);
break;
@@ -8664,11 +8346,11 @@ skip_type_check:
return -EACCES;
}
if (meta->func_id == BPF_FUNC_spin_lock) {
- err = process_spin_lock(env, regno, PROCESS_SPIN_LOCK);
+ err = process_spin_lock(env, reg, argno_from_reg(regno), PROCESS_SPIN_LOCK);
if (err)
return err;
} else if (meta->func_id == BPF_FUNC_spin_unlock) {
- err = process_spin_lock(env, regno, 0);
+ err = process_spin_lock(env, reg, argno_from_reg(regno), 0);
if (err)
return err;
} else {
@@ -8677,7 +8359,7 @@ skip_type_check:
}
break;
case ARG_PTR_TO_TIMER:
- err = process_timer_helper(env, regno, meta);
+ err = process_timer_helper(env, reg, argno_from_reg(regno), meta);
if (err)
return err;
break;
@@ -8690,7 +8372,7 @@ skip_type_check:
*/
meta->raw_mode = arg_type & MEM_UNINIT;
if (arg_type & MEM_FIXED_SIZE) {
- err = check_helper_mem_access(env, regno, fn->arg_size[arg],
+ err = check_helper_mem_access(env, reg, argno_from_reg(regno), fn->arg_size[arg],
arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ,
false, meta);
if (err)
@@ -8700,19 +8382,22 @@ skip_type_check:
}
break;
case ARG_CONST_SIZE:
- err = check_mem_size_reg(env, reg, regno,
+ err = check_mem_size_reg(env, reg_state(env, regno - 1), reg, argno_from_reg(regno - 1),
+ argno_from_reg(regno),
fn->arg_type[arg - 1] & MEM_WRITE ?
BPF_WRITE : BPF_READ,
false, meta);
break;
case ARG_CONST_SIZE_OR_ZERO:
- err = check_mem_size_reg(env, reg, regno,
+ err = check_mem_size_reg(env, reg_state(env, regno - 1), reg, argno_from_reg(regno - 1),
+ argno_from_reg(regno),
fn->arg_type[arg - 1] & MEM_WRITE ?
BPF_WRITE : BPF_READ,
true, meta);
break;
case ARG_PTR_TO_DYNPTR:
- err = process_dynptr_func(env, regno, insn_idx, arg_type, 0);
+ err = process_dynptr_func(env, reg, argno_from_reg(regno), insn_idx, arg_type, &meta->ref_obj,
+ &meta->dynptr);
if (err)
return err;
break;
@@ -8729,7 +8414,7 @@ skip_type_check:
break;
case ARG_PTR_TO_CONST_STR:
{
- err = check_reg_const_str(env, reg, regno);
+ err = check_arg_const_str(env, reg, argno_from_reg(regno));
if (err)
return err;
break;
@@ -9131,11 +8816,29 @@ static bool check_mem_arg_rw_flag_ok(const struct bpf_func_proto *fn)
return true;
}
-static int check_func_proto(const struct bpf_func_proto *fn)
+static bool check_proto_release_reg(const struct bpf_func_proto *fn, struct bpf_call_arg_meta *meta)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) {
+ enum bpf_arg_type arg_type = fn->arg_type[i];
+
+ if (arg_type_is_release(arg_type)) {
+ if (meta->release_regno)
+ return false;
+ meta->release_regno = i + 1;
+ }
+ }
+
+ return true;
+}
+
+static int check_func_proto(const struct bpf_func_proto *fn, struct bpf_call_arg_meta *meta)
{
return check_raw_mode_ok(fn) &&
check_arg_pair_ok(fn) &&
check_mem_arg_rw_flag_ok(fn) &&
+ check_proto_release_reg(fn, meta) &&
check_btf_id_ok(fn) ? 0 : -EINVAL;
}
@@ -9182,14 +8885,14 @@ static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range
reg->range = AT_PKT_END;
}
-static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id)
+static int release_reference_nomark(struct bpf_verifier_state *state, int id)
{
int i;
for (i = 0; i < state->acquired_refs; i++) {
if (state->refs[i].type != REF_TYPE_PTR)
continue;
- if (state->refs[i].id == ref_obj_id) {
+ if (state->refs[i].id == id) {
release_reference_state(state, i);
return 0;
}
@@ -9197,26 +8900,83 @@ static int release_reference_nomark(struct bpf_verifier_state *state, int ref_ob
return -EINVAL;
}
-/* The pointer with the specified id has released its reference to kernel
- * resources. Identify all copies of the same pointer and clear the reference.
- *
- * This is the release function corresponding to acquire_reference(). Idempotent.
- */
-static int release_reference(struct bpf_verifier_env *env, int ref_obj_id)
+static int idstack_push(struct bpf_idmap *idmap, u32 id)
+{
+ int i;
+
+ if (!id)
+ return 0;
+
+ for (i = 0; i < idmap->cnt; i++)
+ if (idmap->map[i].old == id)
+ return 0;
+
+ if (WARN_ON_ONCE(idmap->cnt >= BPF_ID_MAP_SIZE))
+ return -EFAULT;
+
+ idmap->map[idmap->cnt++].old = id;
+ return 0;
+}
+
+static int idstack_pop(struct bpf_idmap *idmap)
+{
+ if (!idmap->cnt)
+ return 0;
+
+ return idmap->map[--idmap->cnt].old;
+}
+
+/* Release id and objects derived from it iteratively in a DFS manner */
+static int release_reference(struct bpf_verifier_env *env, int id)
{
+ u32 mask = (1 << STACK_SPILL) | (1 << STACK_DYNPTR);
struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_idmap *idstack = &env->idmap_scratch;
+ struct bpf_stack_state *stack;
struct bpf_func_state *state;
struct bpf_reg_state *reg;
- int err;
+ int i, err;
- err = release_reference_nomark(vstate, ref_obj_id);
+ idstack->cnt = 0;
+ err = idstack_push(idstack, id);
if (err)
return err;
- bpf_for_each_reg_in_vstate(vstate, state, reg, ({
- if (reg->ref_obj_id == ref_obj_id)
- mark_reg_invalid(env, reg);
- }));
+ if (find_reference_state(vstate, id))
+ WARN_ON_ONCE(release_reference_nomark(vstate, id));
+
+ while ((id = idstack_pop(idstack))) {
+ /*
+ * Child references are inaccessible after parent is released,
+ * any child references that exist at this point are a leak.
+ */
+ for (i = 0; i < vstate->acquired_refs; i++) {
+ if (vstate->refs[i].type != REF_TYPE_PTR)
+ continue;
+ if (vstate->refs[i].parent_id != id)
+ continue;
+ verbose(env, "Leaking reference id=%d alloc_insn=%d. Release it first.\n",
+ vstate->refs[i].id, vstate->refs[i].insn_idx);
+ return -EINVAL;
+ }
+
+ bpf_for_each_reg_in_vstate_mask(vstate, state, reg, stack, mask, ({
+ if (reg->id != id && reg->parent_id != id)
+ continue;
+
+ /* Free objects derived from the current object */
+ if (reg->parent_id == id) {
+ err = idstack_push(idstack, reg->id);
+ if (err)
+ return err;
+ }
+
+ if (!stack || stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL)
+ mark_reg_invalid(env, reg);
+ else if (stack->slot_type[BPF_REG_SIZE - 1] == STACK_DYNPTR)
+ invalidate_dynptr(env, stack);
+ }));
+ }
return 0;
}
@@ -9232,6 +8992,42 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env)
}));
}
+static void invalidate_rcu_protected_refs(struct bpf_verifier_env *env)
+{
+ struct bpf_stack_state *stack;
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+ u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER);
+
+ bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, stack, clear_mask, ({
+ if (reg->type & MEM_RCU) {
+ reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
+ reg->type |= PTR_UNTRUSTED;
+ }
+ }));
+}
+
+static int ref_convert_alloc_rcu_protected(struct bpf_verifier_env *env, u32 id)
+{
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+ int err;
+
+ err = release_reference_nomark(env->cur_state, id);
+
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ if (reg->id != id)
+ continue;
+ if ((reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) {
+ reg->id = 0;
+ reg->type &= ~MEM_ALLOC;
+ reg->type |= MEM_RCU;
+ }
+ }));
+
+ return err;
+}
+
static void clear_caller_saved_regs(struct bpf_verifier_env *env,
struct bpf_reg_state *regs)
{
@@ -9244,6 +9040,15 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env,
}
}
+static void invalidate_outgoing_stack_args(const struct bpf_verifier_env *env,
+ struct bpf_func_state *state)
+{
+ int i, nslots = state->out_stack_arg_cnt;
+
+ for (i = 0; i < nslots; i++)
+ bpf_mark_reg_not_init(env, &state->stack_arg_regs[i]);
+}
+
typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env,
struct bpf_func_state *caller,
struct bpf_func_state *callee,
@@ -9306,11 +9111,23 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
struct bpf_reg_state *regs)
{
struct bpf_subprog_info *sub = subprog_info(env, subprog);
+ struct bpf_func_state *caller = cur_func(env);
struct bpf_verifier_log *log = &env->log;
+ struct ref_obj_desc ref_obj = {};
u32 i;
- int ret;
+ int ret, err;
ret = btf_prepare_func_args(env, subprog);
+ if (ret) {
+ if (bpf_in_stack_arg_cnt(sub) > 0) {
+ err = check_outgoing_stack_args(env, caller, sub->arg_cnt);
+ if (err)
+ return err;
+ }
+ return ret;
+ }
+
+ ret = check_outgoing_stack_args(env, caller, sub->arg_cnt);
if (ret)
return ret;
@@ -9318,13 +9135,13 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
* verifier sees.
*/
for (i = 0; i < sub->arg_cnt; i++) {
- u32 regno = i + 1;
- struct bpf_reg_state *reg = &regs[regno];
+ argno_t argno = argno_from_arg(i + 1);
+ struct bpf_reg_state *reg = get_func_arg_reg(caller, regs, i);
struct bpf_subprog_arg_info *arg = &sub->args[i];
if (arg->arg_type == ARG_ANYTHING) {
if (reg->type != SCALAR_VALUE) {
- bpf_log(log, "R%d is not a scalar\n", regno);
+ bpf_log(log, "%s is not a scalar\n", reg_arg_name(env, argno));
return -EINVAL;
}
} else if (arg->arg_type & PTR_UNTRUSTED) {
@@ -9334,24 +9151,26 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
* invalid memory access.
*/
} else if (arg->arg_type == ARG_PTR_TO_CTX) {
- ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_CTX);
+ ret = check_func_arg_reg_off(env, reg, argno, ARG_PTR_TO_CTX);
if (ret < 0)
return ret;
/* If function expects ctx type in BTF check that caller
* is passing PTR_TO_CTX.
*/
if (reg->type != PTR_TO_CTX) {
- bpf_log(log, "arg#%d expects pointer to ctx\n", i);
+ bpf_log(log, "%s expects pointer to ctx\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
} else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
- ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE);
+ ret = check_func_arg_reg_off(env, reg, argno, ARG_DONTCARE);
if (ret < 0)
return ret;
- if (check_mem_reg(env, reg, regno, arg->mem_size))
+ if (check_mem_reg(env, reg, argno, arg->mem_size))
return -EINVAL;
if (!(arg->arg_type & PTR_MAYBE_NULL) && (reg->type & PTR_MAYBE_NULL)) {
- bpf_log(log, "arg#%d is expected to be non-NULL\n", i);
+ bpf_log(log, "%s is expected to be non-NULL\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
} else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) {
@@ -9363,15 +9182,16 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
* run-time debug nightmare.
*/
if (reg->type != PTR_TO_ARENA && reg->type != SCALAR_VALUE) {
- bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno);
+ bpf_log(log, "%s is not a pointer to arena or scalar.\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
- } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
- ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_DYNPTR);
+ } else if (arg->arg_type == ARG_PTR_TO_DYNPTR) {
+ ret = check_func_arg_reg_off(env, reg, argno, ARG_PTR_TO_DYNPTR);
if (ret)
return ret;
- ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0);
+ ret = process_dynptr_func(env, reg, argno, -1, arg->arg_type, &ref_obj, NULL);
if (ret)
return ret;
} else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) {
@@ -9382,12 +9202,13 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
continue;
memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */
- err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta);
- err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type);
+ err = check_reg_type(env, reg, argno, arg->arg_type, &arg->btf_id, &meta);
+ err = err ?: check_func_arg_reg_off(env, reg, argno, arg->arg_type);
if (err)
return err;
} else {
- verifier_bug(env, "unrecognized arg#%d type %d", i, arg->arg_type);
+ verifier_bug(env, "unrecognized %s type %d",
+ reg_arg_name(env, argno), arg->arg_type);
return -EFAULT;
}
}
@@ -9499,10 +9320,15 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
return 0;
}
+static int process_bpf_exit_full(struct bpf_verifier_env *env,
+ bool *do_print_state, bool exception_exit);
+
static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx)
{
struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_subprog_info *caller_info;
+ u16 callee_incoming, stack_arg_cnt;
struct bpf_func_state *caller;
int err, subprog, target_insn;
@@ -9545,6 +9371,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
/* mark global subprog for verifying after main prog */
subprog_aux(env, subprog)->called = true;
clear_caller_saved_regs(env, caller->regs);
+ invalidate_outgoing_stack_args(env, cur_func(env));
/* All non-void global functions return a 64-bit SCALAR_VALUE. */
if (!subprog_returns_void(env, subprog)) {
@@ -9552,10 +9379,31 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
}
+ if (env->subprog_info[subprog].might_throw) {
+ struct bpf_verifier_state *branch;
+
+ branch = push_stack(env, *insn_idx + 1, *insn_idx, false);
+ if (IS_ERR(branch)) {
+ verbose(env, "failed to push state for global subprog exception path\n");
+ return PTR_ERR(branch);
+ }
+ return process_bpf_exit_full(env, NULL, true);
+ }
+
/* continue with next insn after call */
return 0;
}
+ /*
+ * Track caller's total stack arg count (incoming + max outgoing).
+ * This is needed so the JIT knows how much stack arg space to allocate.
+ */
+ caller_info = &env->subprog_info[caller->subprogno];
+ callee_incoming = bpf_in_stack_arg_cnt(&env->subprog_info[subprog]);
+ stack_arg_cnt = bpf_in_stack_arg_cnt(caller_info) + callee_incoming;
+ if (stack_arg_cnt > caller_info->stack_arg_cnt)
+ caller_info->stack_arg_cnt = stack_arg_cnt;
+
/* for regular function entry setup new frame and continue
* from that frame.
*/
@@ -9839,9 +9687,9 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg)
{
if (range.return_32bit)
- return range.minval <= reg->s32_min_value && reg->s32_max_value <= range.maxval;
+ return range.minval <= reg_s32_min(reg) && reg_s32_max(reg) <= range.maxval;
else
- return range.minval <= reg->smin_value && reg->smax_value <= range.maxval;
+ return range.minval <= reg_smin(reg) && reg_smax(reg) <= range.maxval;
}
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
@@ -9913,6 +9761,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
* bpf_throw, this will be done by copy_verifier_state for extra frames. */
free_func_state(callee);
state->frame[state->curframe--] = NULL;
+ invalidate_outgoing_stack_args(env, caller);
/* for callbacks widen imprecise scalars to make programs like below verify:
*
@@ -9939,7 +9788,9 @@ static int do_refine_retval_range(struct bpf_verifier_env *env,
int func_id,
struct bpf_call_arg_meta *meta)
{
+ struct bpf_retval_range range;
struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
if (ret_type != RET_INTEGER)
return 0;
@@ -9950,21 +9801,36 @@ static int do_refine_retval_range(struct bpf_verifier_env *env,
case BPF_FUNC_probe_read_str:
case BPF_FUNC_probe_read_kernel_str:
case BPF_FUNC_probe_read_user_str:
- ret_reg->smax_value = meta->msize_max_value;
- ret_reg->s32_max_value = meta->msize_max_value;
- ret_reg->smin_value = -MAX_ERRNO;
- ret_reg->s32_min_value = -MAX_ERRNO;
+ reg_set_srange64(ret_reg, -MAX_ERRNO, meta->msize_max_value);
+ reg_set_srange32(ret_reg, -MAX_ERRNO, meta->msize_max_value);
reg_bounds_sync(ret_reg);
break;
case BPF_FUNC_get_smp_processor_id:
- ret_reg->umax_value = nr_cpu_ids - 1;
- ret_reg->u32_max_value = nr_cpu_ids - 1;
- ret_reg->smax_value = nr_cpu_ids - 1;
- ret_reg->s32_max_value = nr_cpu_ids - 1;
- ret_reg->umin_value = 0;
- ret_reg->u32_min_value = 0;
- ret_reg->smin_value = 0;
- ret_reg->s32_min_value = 0;
+ reg_set_urange64(ret_reg, 0, nr_cpu_ids - 1);
+ reg_set_urange32(ret_reg, 0, nr_cpu_ids - 1);
+ reg_bounds_sync(ret_reg);
+ break;
+ case BPF_FUNC_get_retval:
+ /*
+ * bpf_get_retval may see arbitrary value passed by bpf_prog_run_array_cg for
+ * CGROUP_GETSOCKOPT type.
+ */
+ if (prog_type == BPF_PROG_TYPE_CGROUP_SOCKOPT &&
+ env->prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT)
+ break;
+
+ if (prog_type == BPF_PROG_TYPE_LSM &&
+ env->prog->expected_attach_type == BPF_LSM_CGROUP) {
+ if (!env->prog->aux->attach_func_proto->type)
+ break;
+ bpf_lsm_get_retval_range(env->prog, &range);
+ } else {
+ range.minval = -MAX_ERRNO;
+ range.maxval = 0;
+ }
+
+ reg_set_srange64(ret_reg, range.minval, range.maxval);
+ reg_set_srange32(ret_reg, range.minval, range.maxval);
reg_bounds_sync(ret_reg);
break;
}
@@ -10073,7 +9939,7 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi
* kernel. Type checks are performed later in check_return_code.
*/
if (type == BPF_PROG_TYPE_STRUCT_OPS && !exception_exit &&
- reg->ref_obj_id == state->refs[i].id)
+ reg->id == state->refs[i].id)
continue;
verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
state->refs[i].id, state->refs[i].insn_idx);
@@ -10208,13 +10074,16 @@ static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno
state->callback_subprogno == subprogno);
}
-/* Returns whether or not the given map type can potentially elide
+/* Returns whether or not the given map can potentially elide
* lookup return value nullness check. This is possible if the key
* is statically known.
*/
-static bool can_elide_value_nullness(enum bpf_map_type type)
+static bool can_elide_value_nullness(const struct bpf_map *map)
{
- switch (type) {
+ if (map->map_flags & BPF_F_INNER_MAP)
+ return false;
+
+ switch (map->map_type) {
case BPF_MAP_TYPE_ARRAY:
case BPF_MAP_TYPE_PERCPU_ARRAY:
return true;
@@ -10259,6 +10128,24 @@ static const char *non_sleepable_context_description(struct bpf_verifier_env *en
return "non-sleepable prog";
}
+static int release_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ bool convert_rcu, bool release_dynptr)
+{
+ int err = -EINVAL;
+
+ if (bpf_register_is_null(reg))
+ return 0;
+
+ if (release_dynptr)
+ err = unmark_stack_slots_dynptr(env, reg);
+ else if (convert_rcu)
+ err = ref_convert_alloc_rcu_protected(env, reg->id);
+ else if (reg_is_referenced(env, reg))
+ err = release_reference(env, reg->id);
+
+ return err;
+}
+
static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
@@ -10308,7 +10195,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
memset(&meta, 0, sizeof(meta));
meta.pkt_access = fn->pkt_access;
- err = check_func_proto(fn);
+ err = check_func_proto(fn, &meta);
if (err) {
verifier_bug(env, "incorrect func proto %s#%d", func_id_name(func_id), func_id);
return err;
@@ -10340,55 +10227,26 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (err)
return err;
+ regs = cur_regs(env);
+
/* Mark slots with STACK_MISC in case of raw mode, stack offset
* is inferred from register state.
*/
for (i = 0; i < meta.access_size; i++) {
- err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B,
+ err = check_mem_access(env, insn_idx, regs + meta.regno, argno_from_reg(meta.regno), i, BPF_B,
BPF_WRITE, -1, false, false);
if (err)
return err;
}
- regs = cur_regs(env);
-
if (meta.release_regno) {
- err = -EINVAL;
- if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) {
- err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
- } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) {
- u32 ref_obj_id = meta.ref_obj_id;
- bool in_rcu = in_rcu_cs(env);
- struct bpf_func_state *state;
- struct bpf_reg_state *reg;
-
- err = release_reference_nomark(env->cur_state, ref_obj_id);
- if (!err) {
- bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
- if (reg->ref_obj_id == ref_obj_id) {
- if (in_rcu && (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) {
- reg->ref_obj_id = 0;
- reg->type &= ~MEM_ALLOC;
- reg->type |= MEM_RCU;
- } else {
- mark_reg_invalid(env, reg);
- }
- }
- }));
- }
- } else if (meta.ref_obj_id) {
- err = release_reference(env, meta.ref_obj_id);
- } else if (bpf_register_is_null(&regs[meta.release_regno])) {
- /* meta.ref_obj_id can only be 0 if register that is meant to be
- * released is NULL, which must be > R0.
- */
- err = 0;
- }
- if (err) {
- verbose(env, "func %s#%d reference has not been acquired before\n",
- func_id_name(func_id), func_id);
+ struct bpf_reg_state *reg = &regs[meta.release_regno];
+ bool convert_rcu = (func_id == BPF_FUNC_kptr_xchg) && in_rcu_cs(env) &&
+ (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU);
+
+ err = release_reg(env, reg, convert_rcu, !!meta.dynptr.id);
+ if (err)
return err;
- }
}
switch (func_id) {
@@ -10429,7 +10287,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
err = mark_chain_precision(env, BPF_REG_1);
if (err)
return err;
- if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) {
+ if (cur_func(env)->callback_depth < reg_umax(&regs[BPF_REG_1])) {
err = push_callback_call(env, insn, insn_idx, meta.subprogno,
set_loop_callback_state);
} else {
@@ -10447,6 +10305,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
break;
case BPF_FUNC_set_retval:
+ {
+ struct bpf_retval_range range = {
+ .minval = -MAX_ERRNO,
+ .maxval = 0,
+ .return_32bit = true
+ };
+ struct bpf_reg_state *r1 = &regs[BPF_REG_1];
+
+ if (r1->type != SCALAR_VALUE) {
+ verbose(env, "R1 is not a scalar\n");
+ return -EINVAL;
+ }
+
+ /* CGROUP_GETSOCKOPT is allowed to return arbitrary value */
+ if (prog_type == BPF_PROG_TYPE_CGROUP_SOCKOPT &&
+ env->prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT)
+ break;
+
if (prog_type == BPF_PROG_TYPE_LSM &&
env->prog->expected_attach_type == BPF_LSM_CGROUP) {
if (!env->prog->aux->attach_func_proto->type) {
@@ -10456,54 +10332,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
return -EINVAL;
}
- }
- break;
- case BPF_FUNC_dynptr_data:
- {
- struct bpf_reg_state *reg;
- int id, ref_obj_id;
-
- reg = get_dynptr_arg_reg(env, fn, regs);
- if (!reg)
- return -EFAULT;
-
-
- if (meta.dynptr_id) {
- verifier_bug(env, "meta.dynptr_id already set");
- return -EFAULT;
- }
- if (meta.ref_obj_id) {
- verifier_bug(env, "meta.ref_obj_id already set");
- return -EFAULT;
+ bpf_lsm_get_retval_range(env->prog, &range);
}
- id = dynptr_id(env, reg);
- if (id < 0) {
- verifier_bug(env, "failed to obtain dynptr id");
- return id;
- }
+ err = mark_chain_precision(env, BPF_REG_1);
+ if (err)
+ return err;
- ref_obj_id = dynptr_ref_obj_id(env, reg);
- if (ref_obj_id < 0) {
- verifier_bug(env, "failed to obtain dynptr ref_obj_id");
- return ref_obj_id;
+ if (!retval_range_within(range, r1)) {
+ verbose_invalid_scalar(env, r1, range, "At bpf_set_retval", "R1");
+ return -EINVAL;
}
- meta.dynptr_id = id;
- meta.ref_obj_id = ref_obj_id;
-
break;
}
case BPF_FUNC_dynptr_write:
{
- enum bpf_dynptr_type dynptr_type;
- struct bpf_reg_state *reg;
+ enum bpf_dynptr_type dynptr_type = meta.dynptr.type;
- reg = get_dynptr_arg_reg(env, fn, regs);
- if (!reg)
- return -EFAULT;
-
- dynptr_type = dynptr_get_type(env, reg);
if (dynptr_type == BPF_DYNPTR_TYPE_INVALID)
return -EFAULT;
@@ -10547,6 +10393,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
bpf_mark_reg_not_init(env, &regs[caller_saved[i]]);
check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
}
+ invalidate_outgoing_stack_args(env, cur_func(env));
/* helper call returns 64-bit value. */
regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
@@ -10576,7 +10423,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
if (func_id == BPF_FUNC_map_lookup_elem &&
- can_elide_value_nullness(meta.map.ptr->map_type) &&
+ can_elide_value_nullness(meta.map.ptr) &&
meta.const_map_key >= 0 &&
meta.const_map_key < meta.map.ptr->max_entries)
ret_flag &= ~PTR_MAYBE_NULL;
@@ -10688,29 +10535,45 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (type_may_be_null(regs[BPF_REG_0].type))
regs[BPF_REG_0].id = ++env->id_gen;
- if (helper_multiple_ref_obj_use(func_id, meta.map.ptr)) {
- verifier_bug(env, "func %s#%d sets ref_obj_id more than once",
- func_id_name(func_id), func_id);
- return -EFAULT;
- }
+ if (is_ptr_cast_function(func_id) &&
+ find_reference_state(env->cur_state, meta.ref_obj.id)) {
+ struct bpf_verifier_state *branch;
+ struct bpf_reg_state *r0;
- if (is_dynptr_ref_function(func_id))
- regs[BPF_REG_0].dynptr_id = meta.dynptr_id;
+ err = validate_ref_obj(env, &meta.ref_obj);
+ if (err)
+ return err;
- if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) {
- /* For release_reference() */
- regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
+ /*
+ * In order for a release of any of the original or cast pointers
+ * to invalidate all other pointers, reuse the same reference id for
+ * the cast result.
+ * This reference id can't be used for nullness propagation,
+ * as cast might return NULL for a non-NULL input.
+ * Hence, explore the NULL case as a separate branch.
+ */
+ branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false);
+ if (IS_ERR(branch))
+ return PTR_ERR(branch);
+
+ r0 = &branch->frame[branch->curframe]->regs[BPF_REG_0];
+ __mark_reg_known_zero(r0);
+ r0->type = SCALAR_VALUE;
+
+ regs[BPF_REG_0].type &= ~PTR_MAYBE_NULL;
+ regs[BPF_REG_0].id = meta.ref_obj.id;
} else if (is_acquire_function(func_id, meta.map.ptr)) {
- int id = acquire_reference(env, insn_idx);
+ int id = acquire_reference(env, insn_idx, 0);
if (id < 0)
return id;
- /* For mark_ptr_or_null_reg() */
+
regs[BPF_REG_0].id = id;
- /* For release_reference() */
- regs[BPF_REG_0].ref_obj_id = id;
}
+ if (func_id == BPF_FUNC_dynptr_data)
+ regs[BPF_REG_0].parent_id = meta.dynptr.id;
+
err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta);
if (err)
return err;
@@ -10806,7 +10669,6 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)
return meta->kfunc_flags & KF_RELEASE;
}
-
static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta)
{
return meta->kfunc_flags & KF_DESTRUCTIVE;
@@ -10883,6 +10745,11 @@ static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param
return btf_param_match_suffix(btf, arg, "__nullable");
}
+static bool is_kfunc_arg_nonown_allowed(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__nonown_allowed");
+}
+
static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg)
{
return btf_param_match_suffix(btf, arg, "__str");
@@ -11123,10 +10990,15 @@ enum special_kfunc_type {
KF_bpf_list_push_front,
KF_bpf_list_push_back_impl,
KF_bpf_list_push_back,
+ KF_bpf_list_add,
KF_bpf_list_pop_front,
KF_bpf_list_pop_back,
+ KF_bpf_list_del,
KF_bpf_list_front,
KF_bpf_list_back,
+ KF_bpf_list_is_first,
+ KF_bpf_list_is_last,
+ KF_bpf_list_empty,
KF_bpf_cast_to_kern_ctx,
KF_bpf_rdonly_cast,
KF_bpf_rcu_read_lock,
@@ -11191,10 +11063,15 @@ BTF_ID(func, bpf_list_push_front_impl)
BTF_ID(func, bpf_list_push_front)
BTF_ID(func, bpf_list_push_back_impl)
BTF_ID(func, bpf_list_push_back)
+BTF_ID(func, bpf_list_add)
BTF_ID(func, bpf_list_pop_front)
BTF_ID(func, bpf_list_pop_back)
+BTF_ID(func, bpf_list_del)
BTF_ID(func, bpf_list_front)
BTF_ID(func, bpf_list_back)
+BTF_ID(func, bpf_list_is_first)
+BTF_ID(func, bpf_list_is_last)
+BTF_ID(func, bpf_list_empty)
BTF_ID(func, bpf_cast_to_kern_ctx)
BTF_ID(func, bpf_rdonly_cast)
BTF_ID(func, bpf_rcu_read_lock)
@@ -11263,7 +11140,11 @@ BTF_ID(func, bpf_task_work_schedule_resume)
BTF_ID(func, bpf_arena_alloc_pages)
BTF_ID(func, bpf_arena_free_pages)
BTF_ID(func, bpf_arena_reserve_pages)
+#ifdef CONFIG_BPF_EVENTS
BTF_ID(func, bpf_session_is_return)
+#else
+BTF_ID_UNUSED
+#endif
BTF_ID(func, bpf_stream_vprintk)
BTF_ID(func, bpf_stream_print_stack)
@@ -11302,7 +11183,8 @@ static bool is_bpf_list_push_kfunc(u32 func_id)
return func_id == special_kfunc_list[KF_bpf_list_push_front] ||
func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
func_id == special_kfunc_list[KF_bpf_list_push_back] ||
- func_id == special_kfunc_list[KF_bpf_list_push_back_impl];
+ func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+ func_id == special_kfunc_list[KF_bpf_list_add];
}
static bool is_bpf_rbtree_add_kfunc(u32 func_id)
@@ -11351,15 +11233,12 @@ bool bpf_is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta)
}
static enum kfunc_ptr_arg_type
-get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
- struct bpf_kfunc_call_arg_meta *meta,
+get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_func_state *caller,
+ struct bpf_reg_state *regs, struct bpf_kfunc_call_arg_meta *meta,
const struct btf_type *t, const struct btf_type *ref_t,
const char *ref_tname, const struct btf_param *args,
- int argno, int nargs)
+ int arg, int nargs, argno_t argno, struct bpf_reg_state *reg)
{
- u32 regno = argno + 1;
- struct bpf_reg_state *regs = cur_regs(env);
- struct bpf_reg_state *reg = &regs[regno];
bool arg_mem_size = false;
if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
@@ -11367,9 +11246,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
meta->func_id == special_kfunc_list[KF_bpf_session_cookie])
return KF_ARG_PTR_TO_CTX;
- if (argno + 1 < nargs &&
- (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
- is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))
+ if (arg + 1 < nargs &&
+ (is_kfunc_arg_mem_size(meta->btf, &args[arg + 1], get_func_arg_reg(caller, regs, arg + 1)) ||
+ is_kfunc_arg_const_mem_size(meta->btf, &args[arg + 1], get_func_arg_reg(caller, regs, arg + 1))))
arg_mem_size = true;
/* In this function, we verify the kfunc's BTF as per the argument type,
@@ -11377,68 +11256,69 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
* type to our caller. When a set of conditions hold in the BTF type of
* arguments, we resolve it to a known kfunc_ptr_arg_type.
*/
- if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
+ if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), arg))
return KF_ARG_PTR_TO_CTX;
- if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && bpf_register_is_null(reg) &&
+ if (is_kfunc_arg_nullable(meta->btf, &args[arg]) && bpf_register_is_null(reg) &&
!arg_mem_size)
return KF_ARG_PTR_TO_NULL;
- if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))
+ if (is_kfunc_arg_alloc_obj(meta->btf, &args[arg]))
return KF_ARG_PTR_TO_ALLOC_BTF_ID;
- if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[argno]))
+ if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[arg]))
return KF_ARG_PTR_TO_REFCOUNTED_KPTR;
- if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
+ if (is_kfunc_arg_dynptr(meta->btf, &args[arg]))
return KF_ARG_PTR_TO_DYNPTR;
- if (is_kfunc_arg_iter(meta, argno, &args[argno]))
+ if (is_kfunc_arg_iter(meta, arg, &args[arg]))
return KF_ARG_PTR_TO_ITER;
- if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
+ if (is_kfunc_arg_list_head(meta->btf, &args[arg]))
return KF_ARG_PTR_TO_LIST_HEAD;
- if (is_kfunc_arg_list_node(meta->btf, &args[argno]))
+ if (is_kfunc_arg_list_node(meta->btf, &args[arg]))
return KF_ARG_PTR_TO_LIST_NODE;
- if (is_kfunc_arg_rbtree_root(meta->btf, &args[argno]))
+ if (is_kfunc_arg_rbtree_root(meta->btf, &args[arg]))
return KF_ARG_PTR_TO_RB_ROOT;
- if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno]))
+ if (is_kfunc_arg_rbtree_node(meta->btf, &args[arg]))
return KF_ARG_PTR_TO_RB_NODE;
- if (is_kfunc_arg_const_str(meta->btf, &args[argno]))
+ if (is_kfunc_arg_const_str(meta->btf, &args[arg]))
return KF_ARG_PTR_TO_CONST_STR;
- if (is_kfunc_arg_map(meta->btf, &args[argno]))
+ if (is_kfunc_arg_map(meta->btf, &args[arg]))
return KF_ARG_PTR_TO_MAP;
- if (is_kfunc_arg_wq(meta->btf, &args[argno]))
+ if (is_kfunc_arg_wq(meta->btf, &args[arg]))
return KF_ARG_PTR_TO_WORKQUEUE;
- if (is_kfunc_arg_timer(meta->btf, &args[argno]))
+ if (is_kfunc_arg_timer(meta->btf, &args[arg]))
return KF_ARG_PTR_TO_TIMER;
- if (is_kfunc_arg_task_work(meta->btf, &args[argno]))
+ if (is_kfunc_arg_task_work(meta->btf, &args[arg]))
return KF_ARG_PTR_TO_TASK_WORK;
- if (is_kfunc_arg_irq_flag(meta->btf, &args[argno]))
+ if (is_kfunc_arg_irq_flag(meta->btf, &args[arg]))
return KF_ARG_PTR_TO_IRQ_FLAG;
- if (is_kfunc_arg_res_spin_lock(meta->btf, &args[argno]))
+ if (is_kfunc_arg_res_spin_lock(meta->btf, &args[arg]))
return KF_ARG_PTR_TO_RES_SPIN_LOCK;
if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
if (!btf_type_is_struct(ref_t)) {
- verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
- meta->func_name, argno, btf_type_str(ref_t), ref_tname);
+ verbose(env, "kernel function %s %s pointer type %s %s is not supported\n",
+ meta->func_name, reg_arg_name(env, argno),
+ btf_type_str(ref_t), ref_tname);
return -EINVAL;
}
return KF_ARG_PTR_TO_BTF_ID;
}
- if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
+ if (is_kfunc_arg_callback(env, meta->btf, &args[arg]))
return KF_ARG_PTR_TO_CALLBACK;
/* This is the catch all argument type of register types supported by
@@ -11448,8 +11328,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
*/
if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) &&
(arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
- verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
- argno, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
+ verbose(env, "%s pointer type %s %s must point to %sscalar, or struct with scalar\n",
+ reg_arg_name(env, argno),
+ btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
return -EINVAL;
}
return arg_mem_size ? KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM;
@@ -11460,7 +11341,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
const struct btf_type *ref_t,
const char *ref_tname, u32 ref_id,
struct bpf_kfunc_call_arg_meta *meta,
- int argno)
+ int arg, argno_t argno)
{
const struct btf_type *reg_ref_t;
bool strict_type_match = false;
@@ -11502,7 +11383,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
* btf_struct_ids_match() to walk the struct at the 0th offset, and
* resolve types.
*/
- if ((is_kfunc_release(meta) && reg->ref_obj_id) ||
+ if ((is_kfunc_release(meta) && reg_is_referenced(env, reg)) ||
btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id))
strict_type_match = true;
@@ -11518,19 +11399,19 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
*/
taking_projection = btf_is_projection_of(ref_tname, reg_ref_tname);
if (!taking_projection && !struct_same) {
- verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
- meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1,
+ verbose(env, "kernel function %s %s expected pointer to %s %s but %s has a pointer to %s %s\n",
+ meta->func_name, reg_arg_name(env, argno),
+ btf_type_str(ref_t), ref_tname, reg_arg_name(env, argno),
btf_type_str(reg_ref_t), reg_ref_tname);
return -EINVAL;
}
return 0;
}
-static int process_irq_flag(struct bpf_verifier_env *env, int regno,
+static int process_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno,
struct bpf_kfunc_call_arg_meta *meta)
{
- struct bpf_reg_state *reg = reg_state(env, regno);
- int err, kfunc_class = IRQ_NATIVE_KFUNC;
+ int err, spi, kfunc_class = IRQ_NATIVE_KFUNC;
bool irq_save;
if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save] ||
@@ -11550,11 +11431,13 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno,
if (irq_save) {
if (!is_irq_flag_reg_valid_uninit(env, reg)) {
- verbose(env, "expected uninitialized irq flag as arg#%d\n", regno - 1);
+ verbose(env, "expected uninitialized irq flag as %s\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
- err = check_mem_access(env, env->insn_idx, regno, 0, BPF_DW, BPF_WRITE, -1, false, false);
+ err = check_mem_access(env, env->insn_idx, reg, argno, 0, BPF_DW,
+ BPF_WRITE, -1, false, false);
if (err)
return err;
@@ -11564,13 +11447,16 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno,
} else {
err = is_irq_flag_reg_valid_init(env, reg);
if (err) {
- verbose(env, "expected an initialized irq flag as arg#%d\n", regno - 1);
+ verbose(env, "expected an initialized irq flag as %s\n",
+ reg_arg_name(env, argno));
return err;
}
- err = mark_irq_flag_read(env, reg);
- if (err)
- return err;
+ spi = irq_flag_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ mark_stack_slots_scratched(env, spi, 1);
err = unmark_stack_slot_irq_flag(env, reg, kfunc_class);
if (err)
@@ -11601,36 +11487,21 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state
return 0;
}
-static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id)
+static void ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 id)
{
- struct bpf_verifier_state *state = env->cur_state;
struct bpf_func_state *unused;
struct bpf_reg_state *reg;
- int i;
- if (!ref_obj_id) {
- verifier_bug(env, "ref_obj_id is zero for owning -> non-owning conversion");
- return -EFAULT;
- }
+ WARN_ON_ONCE(release_reference_nomark(env->cur_state, id));
- for (i = 0; i < state->acquired_refs; i++) {
- if (state->refs[i].id != ref_obj_id)
- continue;
-
- /* Clear ref_obj_id here so release_reference doesn't clobber
- * the whole reg
- */
- bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
- if (reg->ref_obj_id == ref_obj_id) {
- reg->ref_obj_id = 0;
- ref_set_non_owning(env, reg);
- }
- }));
- return 0;
- }
+ bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
+ if (reg->id == id) {
+ reg->id = 0;
+ ref_set_non_owning(env, reg);
+ }
+ }));
- verifier_bug(env, "ref state missing for ref_obj_id");
- return -EFAULT;
+ return;
}
/* Implementation details:
@@ -11711,8 +11582,12 @@ static bool is_bpf_list_api_kfunc(u32 btf_id)
return is_bpf_list_push_kfunc(btf_id) ||
btf_id == special_kfunc_list[KF_bpf_list_pop_front] ||
btf_id == special_kfunc_list[KF_bpf_list_pop_back] ||
+ btf_id == special_kfunc_list[KF_bpf_list_del] ||
btf_id == special_kfunc_list[KF_bpf_list_front] ||
- btf_id == special_kfunc_list[KF_bpf_list_back];
+ btf_id == special_kfunc_list[KF_bpf_list_back] ||
+ btf_id == special_kfunc_list[KF_bpf_list_is_first] ||
+ btf_id == special_kfunc_list[KF_bpf_list_is_last] ||
+ btf_id == special_kfunc_list[KF_bpf_list_empty];
}
static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
@@ -11778,7 +11653,7 @@ static bool is_async_callback_calling_kfunc(u32 btf_id)
is_task_work_add_kfunc(btf_id);
}
-static bool is_bpf_throw_kfunc(struct bpf_insn *insn)
+bool bpf_is_throw_kfunc(struct bpf_insn *insn)
{
return bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
insn->imm == special_kfunc_list[KF_bpf_throw];
@@ -11833,7 +11708,10 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
switch (node_field_type) {
case BPF_LIST_NODE:
- ret = is_bpf_list_push_kfunc(kfunc_btf_id);
+ ret = is_bpf_list_push_kfunc(kfunc_btf_id) ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_list_del] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_list_is_first] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_list_is_last];
break;
case BPF_RB_NODE:
ret = (is_bpf_rbtree_add_kfunc(kfunc_btf_id) ||
@@ -11855,7 +11733,7 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
static int
__process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg, u32 regno,
+ struct bpf_reg_state *reg, argno_t argno,
struct bpf_kfunc_call_arg_meta *meta,
enum btf_field_type head_field_type,
struct btf_field **head_field)
@@ -11876,8 +11754,8 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env,
head_type_name = btf_field_type_name(head_field_type);
if (!tnum_is_const(reg->var_off)) {
verbose(env,
- "R%d doesn't have constant offset. %s has to be at the constant offset\n",
- regno, head_type_name);
+ "%s doesn't have constant offset. %s has to be at the constant offset\n",
+ reg_arg_name(env, argno), head_type_name);
return -EINVAL;
}
@@ -11905,24 +11783,24 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env,
}
static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg, u32 regno,
+ struct bpf_reg_state *reg, argno_t argno,
struct bpf_kfunc_call_arg_meta *meta)
{
- return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_LIST_HEAD,
+ return __process_kf_arg_ptr_to_graph_root(env, reg, argno, meta, BPF_LIST_HEAD,
&meta->arg_list_head.field);
}
static int process_kf_arg_ptr_to_rbtree_root(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg, u32 regno,
+ struct bpf_reg_state *reg, argno_t argno,
struct bpf_kfunc_call_arg_meta *meta)
{
- return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_RB_ROOT,
+ return __process_kf_arg_ptr_to_graph_root(env, reg, argno, meta, BPF_RB_ROOT,
&meta->arg_rbtree_root.field);
}
static int
__process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg, u32 regno,
+ struct bpf_reg_state *reg, argno_t argno,
struct bpf_kfunc_call_arg_meta *meta,
enum btf_field_type head_field_type,
enum btf_field_type node_field_type,
@@ -11944,8 +11822,8 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
node_type_name = btf_field_type_name(node_field_type);
if (!tnum_is_const(reg->var_off)) {
verbose(env,
- "R%d doesn't have constant offset. %s has to be at the constant offset\n",
- regno, node_type_name);
+ "%s doesn't have constant offset. %s has to be at the constant offset\n",
+ reg_arg_name(env, argno), node_type_name);
return -EINVAL;
}
@@ -11986,19 +11864,19 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
}
static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg, u32 regno,
+ struct bpf_reg_state *reg, argno_t argno,
struct bpf_kfunc_call_arg_meta *meta)
{
- return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
+ return __process_kf_arg_ptr_to_graph_node(env, reg, argno, meta,
BPF_LIST_HEAD, BPF_LIST_NODE,
&meta->arg_list_head.field);
}
static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg, u32 regno,
+ struct bpf_reg_state *reg, argno_t argno,
struct bpf_kfunc_call_arg_meta *meta)
{
- return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
+ return __process_kf_arg_ptr_to_graph_node(env, reg, argno, meta,
BPF_RB_ROOT, BPF_RB_NODE,
&meta->arg_rbtree_root.field);
}
@@ -12029,6 +11907,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
int insn_idx)
{
const char *func_name = meta->func_name, *ref_tname;
+ struct bpf_func_state *caller = cur_func(env);
+ struct bpf_reg_state *regs = cur_regs(env);
const struct btf *btf = meta->btf;
const struct btf_param *args;
struct btf_record *rec;
@@ -12037,20 +11917,31 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
args = (const struct btf_param *)(meta->func_proto + 1);
nargs = btf_type_vlen(meta->func_proto);
- if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+ if (nargs > MAX_BPF_FUNC_ARGS) {
verbose(env, "Function %s has %d > %d args\n", func_name, nargs,
- MAX_BPF_FUNC_REG_ARGS);
+ MAX_BPF_FUNC_ARGS);
return -EINVAL;
}
+ if (nargs > MAX_BPF_FUNC_REG_ARGS && !bpf_jit_supports_stack_args()) {
+ verbose(env, "JIT does not support kfunc %s() with %d args\n",
+ func_name, nargs);
+ return -ENOTSUPP;
+ }
+
+ ret = check_outgoing_stack_args(env, caller, nargs);
+ if (ret)
+ return ret;
/* Check that BTF function arguments match actual types that the
* verifier sees.
*/
for (i = 0; i < nargs; i++) {
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[i + 1];
+ struct bpf_reg_state *reg = get_func_arg_reg(caller, regs, i);
const struct btf_type *t, *ref_t, *resolve_ret;
enum bpf_arg_type arg_type = ARG_DONTCARE;
- u32 regno = i + 1, ref_id, type_size;
+ argno_t argno = argno_from_arg(i + 1);
+ int regno = reg_from_argno(argno);
+ u32 ref_id, type_size;
bool is_ret_buf_sz = false;
int kf_arg_type;
@@ -12060,6 +11951,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
verifier_bug(env, "Only 1 prog->aux argument supported per-kfunc");
return -EFAULT;
}
+ if (regno < 0) {
+ verbose(env, "%s prog->aux cannot be a stack argument\n",
+ reg_arg_name(env, argno));
+ return -EINVAL;
+ }
meta->arg_prog = true;
cur_aux(env)->arg_prog = regno;
continue;
@@ -12072,7 +11968,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (btf_type_is_scalar(t)) {
if (reg->type != SCALAR_VALUE) {
- verbose(env, "R%d is not a scalar\n", regno);
+ verbose(env, "%s is not a scalar\n", reg_arg_name(env, argno));
return -EINVAL;
}
@@ -12082,10 +11978,14 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return -EFAULT;
}
if (!tnum_is_const(reg->var_off)) {
- verbose(env, "R%d must be a known constant\n", regno);
+ verbose(env, "%s must be a known constant\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
- ret = mark_chain_precision(env, regno);
+ if (regno >= 0)
+ ret = mark_chain_precision(env, regno);
+ else
+ ret = mark_stack_arg_precision(env, i);
if (ret < 0)
return ret;
meta->arg_constant.found = true;
@@ -12104,12 +12004,16 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
}
if (!tnum_is_const(reg->var_off)) {
- verbose(env, "R%d is not a const\n", regno);
+ verbose(env, "%s is not a const\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
meta->r0_size = reg->var_off.value;
- ret = mark_chain_precision(env, regno);
+ if (regno >= 0)
+ ret = mark_chain_precision(env, regno);
+ else
+ ret = mark_stack_arg_precision(env, i);
if (ret)
return ret;
}
@@ -12117,32 +12021,33 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
}
if (!btf_type_is_ptr(t)) {
- verbose(env, "Unrecognized arg#%d type %s\n", i, btf_type_str(t));
+ verbose(env, "Unrecognized %s type %s\n",
+ reg_arg_name(env, argno), btf_type_str(t));
return -EINVAL;
}
if ((bpf_register_is_null(reg) || type_may_be_null(reg->type)) &&
!is_kfunc_arg_nullable(meta->btf, &args[i])) {
- verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
+ verbose(env, "Possibly NULL pointer passed to trusted %s\n",
+ reg_arg_name(env, argno));
return -EACCES;
}
- if (reg->ref_obj_id) {
- if (is_kfunc_release(meta) && meta->ref_obj_id) {
- verifier_bug(env, "more than one arg with ref_obj_id R%d %u %u",
- regno, reg->ref_obj_id,
- meta->ref_obj_id);
- return -EFAULT;
- }
- meta->ref_obj_id = reg->ref_obj_id;
- if (is_kfunc_release(meta))
- meta->release_regno = regno;
+ if (regno == meta->release_regno && !is_kfunc_arg_dynptr(meta->btf, &args[i]) &&
+ !reg_is_referenced(env, reg) && !bpf_register_is_null(reg)) {
+ verbose(env, "release kfunc %s expects referenced PTR_TO_BTF_ID passed to %s\n",
+ func_name, reg_arg_name(env, argno));
+ return -EINVAL;
}
+ if (reg_is_referenced(env, reg))
+ update_ref_obj(&meta->ref_obj, reg);
+
ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
ref_tname = btf_name_by_offset(btf, ref_t->name_off);
- kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs);
+ kf_arg_type = get_kfunc_ptr_arg_type(env, caller, regs, meta, t, ref_t, ref_tname,
+ args, i, nargs, argno, reg);
if (kf_arg_type < 0)
return kf_arg_type;
@@ -12151,7 +12056,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
continue;
case KF_ARG_PTR_TO_MAP:
if (!reg->map_ptr) {
- verbose(env, "pointer in R%d isn't map pointer\n", regno);
+ verbose(env, "pointer in %s isn't map pointer\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
if (meta->map.ptr && (reg->map_ptr->record->wq_off >= 0 ||
@@ -12187,18 +12093,19 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
fallthrough;
case KF_ARG_PTR_TO_ALLOC_BTF_ID:
case KF_ARG_PTR_TO_BTF_ID:
- if (!is_trusted_reg(reg)) {
+ if (!is_trusted_reg(env, reg)) {
if (!is_kfunc_rcu(meta)) {
- verbose(env, "R%d must be referenced or trusted\n", regno);
+ verbose(env, "%s must be referenced or trusted\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
if (!is_rcu_reg(reg)) {
- verbose(env, "R%d must be a rcu pointer\n", regno);
+ verbose(env, "%s must be a rcu pointer\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
}
fallthrough;
- case KF_ARG_PTR_TO_DYNPTR:
case KF_ARG_PTR_TO_ITER:
case KF_ARG_PTR_TO_LIST_HEAD:
case KF_ARG_PTR_TO_LIST_NODE:
@@ -12215,6 +12122,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_IRQ_FLAG:
case KF_ARG_PTR_TO_RES_SPIN_LOCK:
break;
+ case KF_ARG_PTR_TO_DYNPTR:
+ arg_type = ARG_PTR_TO_DYNPTR;
+ break;
case KF_ARG_PTR_TO_CTX:
arg_type = ARG_PTR_TO_CTX;
break;
@@ -12223,17 +12133,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return -EFAULT;
}
- if (is_kfunc_release(meta) && reg->ref_obj_id)
+ if (regno == meta->release_regno)
arg_type |= OBJ_RELEASE;
- ret = check_func_arg_reg_off(env, reg, regno, arg_type);
+ ret = check_func_arg_reg_off(env, reg, argno, arg_type);
if (ret < 0)
return ret;
switch (kf_arg_type) {
case KF_ARG_PTR_TO_CTX:
if (reg->type != PTR_TO_CTX) {
- verbose(env, "arg#%d expected pointer to ctx, but got %s\n",
- i, reg_type_str(env, reg->type));
+ verbose(env, "%s expected pointer to ctx, but got %s\n",
+ reg_arg_name(env, argno), reg_type_str(env, reg->type));
return -EINVAL;
}
@@ -12247,19 +12157,22 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_ALLOC_BTF_ID:
if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) {
if (!is_bpf_obj_drop_kfunc(meta->func_id)) {
- verbose(env, "arg#%d expected for bpf_obj_drop()\n", i);
+ verbose(env, "%s expected for bpf_obj_drop()\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
} else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC | MEM_PERCPU)) {
if (!is_bpf_percpu_obj_drop_kfunc(meta->func_id)) {
- verbose(env, "arg#%d expected for bpf_percpu_obj_drop()\n", i);
+ verbose(env, "%s expected for bpf_percpu_obj_drop()\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
} else {
- verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ verbose(env, "%s expected pointer to allocated object\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
- if (!reg->ref_obj_id) {
+ if (!reg_is_referenced(env, reg)) {
verbose(env, "allocated object must be referenced\n");
return -EINVAL;
}
@@ -12271,10 +12184,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_DYNPTR:
{
enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR;
- int clone_ref_obj_id = 0;
-
- if (reg->type == CONST_PTR_TO_DYNPTR)
- dynptr_arg_type |= MEM_RDONLY;
if (is_kfunc_arg_uninit(btf, &args[i]))
dynptr_arg_type |= MEM_UNINIT;
@@ -12288,11 +12197,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) {
dynptr_arg_type |= DYNPTR_TYPE_FILE;
} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_file_discard]) {
- dynptr_arg_type |= DYNPTR_TYPE_FILE;
- meta->release_regno = regno;
+ dynptr_arg_type |= DYNPTR_TYPE_FILE | OBJ_RELEASE;
} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] &&
(dynptr_arg_type & MEM_UNINIT)) {
- enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type;
+ enum bpf_dynptr_type parent_type = meta->dynptr.type;
if (parent_type == BPF_DYNPTR_TYPE_INVALID) {
verifier_bug(env, "no dynptr type for parent of clone");
@@ -12300,29 +12208,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
}
dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type);
- clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id;
- if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) {
- verifier_bug(env, "missing ref obj id for parent of clone");
- return -EFAULT;
- }
}
- ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id);
+ ret = process_dynptr_func(env, reg, argno, insn_idx, dynptr_arg_type,
+ &meta->ref_obj, &meta->dynptr);
if (ret < 0)
return ret;
-
- if (!(dynptr_arg_type & MEM_UNINIT)) {
- int id = dynptr_id(env, reg);
-
- if (id < 0) {
- verifier_bug(env, "failed to obtain dynptr id");
- return id;
- }
- meta->initialized_dynptr.id = id;
- meta->initialized_dynptr.type = dynptr_get_type(env, reg);
- meta->initialized_dynptr.ref_obj_id = dynptr_ref_obj_id(env, reg);
- }
-
break;
}
case KF_ARG_PTR_TO_ITER:
@@ -12332,63 +12223,78 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return -EINVAL;
}
}
- ret = process_iter_arg(env, regno, insn_idx, meta);
+ ret = process_iter_arg(env, reg, argno, insn_idx, meta);
if (ret < 0)
return ret;
break;
case KF_ARG_PTR_TO_LIST_HEAD:
if (reg->type != PTR_TO_MAP_VALUE &&
reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
- verbose(env, "arg#%d expected pointer to map value or allocated object\n", i);
+ verbose(env, "%s expected pointer to map value or allocated object\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
- if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
+ if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) &&
+ !reg_is_referenced(env, reg)) {
verbose(env, "allocated object must be referenced\n");
return -EINVAL;
}
- ret = process_kf_arg_ptr_to_list_head(env, reg, regno, meta);
+ ret = process_kf_arg_ptr_to_list_head(env, reg, argno, meta);
if (ret < 0)
return ret;
break;
case KF_ARG_PTR_TO_RB_ROOT:
if (reg->type != PTR_TO_MAP_VALUE &&
reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
- verbose(env, "arg#%d expected pointer to map value or allocated object\n", i);
+ verbose(env, "%s expected pointer to map value or allocated object\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
- if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
+ if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) &&
+ !reg_is_referenced(env, reg)) {
verbose(env, "allocated object must be referenced\n");
return -EINVAL;
}
- ret = process_kf_arg_ptr_to_rbtree_root(env, reg, regno, meta);
+ ret = process_kf_arg_ptr_to_rbtree_root(env, reg, argno, meta);
if (ret < 0)
return ret;
break;
case KF_ARG_PTR_TO_LIST_NODE:
+ if (is_kfunc_arg_nonown_allowed(btf, &args[i]) &&
+ type_is_non_owning_ref(reg->type) && !reg_is_referenced(env, reg)) {
+ /* Allow bpf_list_front/back return value for
+ * __nonown_allowed list-node arguments.
+ */
+ goto check_ok;
+ }
if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
- verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ verbose(env, "%s expected pointer to allocated object\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
- if (!reg->ref_obj_id) {
+ if (!reg_is_referenced(env, reg)) {
verbose(env, "allocated object must be referenced\n");
return -EINVAL;
}
- ret = process_kf_arg_ptr_to_list_node(env, reg, regno, meta);
+check_ok:
+ ret = process_kf_arg_ptr_to_list_node(env, reg, argno, meta);
if (ret < 0)
return ret;
break;
case KF_ARG_PTR_TO_RB_NODE:
if (is_bpf_rbtree_add_kfunc(meta->func_id)) {
if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
- verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ verbose(env, "%s expected pointer to allocated object\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
- if (!reg->ref_obj_id) {
+ if (!reg_is_referenced(env, reg)) {
verbose(env, "allocated object must be referenced\n");
return -EINVAL;
}
} else {
- if (!type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) {
+ if (!type_is_non_owning_ref(reg->type) &&
+ !reg_is_referenced(env, reg)) {
verbose(env, "%s can only take non-owning or refcounted bpf_rb_node pointer\n", func_name);
return -EINVAL;
}
@@ -12398,7 +12304,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
}
}
- ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta);
+ ret = process_kf_arg_ptr_to_rbtree_node(env, reg, argno, meta);
if (ret < 0)
return ret;
break;
@@ -12413,38 +12319,44 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if ((base_type(reg->type) != PTR_TO_BTF_ID ||
(bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) &&
!reg2btf_ids[base_type(reg->type)]) {
- verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type));
+ verbose(env, "%s is %s ", reg_arg_name(env, argno),
+ reg_type_str(env, reg->type));
verbose(env, "expected %s or socket\n",
reg_type_str(env, base_type(reg->type) |
(type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS)));
return -EINVAL;
}
- ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i);
+ ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i, argno);
if (ret < 0)
return ret;
break;
case KF_ARG_PTR_TO_MEM:
resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
if (IS_ERR(resolve_ret)) {
- verbose(env, "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
- i, btf_type_str(ref_t), ref_tname, PTR_ERR(resolve_ret));
+ verbose(env, "%s reference type('%s %s') size cannot be determined: %ld\n",
+ reg_arg_name(env, argno), btf_type_str(ref_t),
+ ref_tname, PTR_ERR(resolve_ret));
return -EINVAL;
}
- ret = check_mem_reg(env, reg, regno, type_size);
+ ret = check_mem_reg(env, reg, argno, type_size);
if (ret < 0)
return ret;
break;
case KF_ARG_PTR_TO_MEM_SIZE:
{
- struct bpf_reg_state *buff_reg = &regs[regno];
+ struct bpf_reg_state *buff_reg = reg;
const struct btf_param *buff_arg = &args[i];
- struct bpf_reg_state *size_reg = &regs[regno + 1];
+ struct bpf_reg_state *size_reg = get_func_arg_reg(caller, regs, i + 1);
const struct btf_param *size_arg = &args[i + 1];
+ argno_t next_argno = argno_from_arg(i + 2);
if (!bpf_register_is_null(buff_reg) || !is_kfunc_arg_nullable(meta->btf, buff_arg)) {
- ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
+ ret = check_kfunc_mem_size_reg(env, buff_reg, size_reg,
+ argno, next_argno);
if (ret < 0) {
- verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
+ verbose(env, "%s and ", reg_arg_name(env, argno));
+ verbose(env, "%s memory, len pair leads to invalid memory access\n",
+ reg_arg_name(env, next_argno));
return ret;
}
}
@@ -12455,7 +12367,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return -EFAULT;
}
if (!tnum_is_const(size_reg->var_off)) {
- verbose(env, "R%d must be a known constant\n", regno + 1);
+ verbose(env, "%s must be a known constant\n",
+ reg_arg_name(env, next_argno));
return -EINVAL;
}
meta->arg_constant.found = true;
@@ -12468,14 +12381,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
}
case KF_ARG_PTR_TO_CALLBACK:
if (reg->type != PTR_TO_FUNC) {
- verbose(env, "arg%d expected pointer to func\n", i);
+ verbose(env, "%s expected pointer to func\n", reg_arg_name(env, argno));
return -EINVAL;
}
meta->subprogno = reg->subprogno;
break;
case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
if (!type_is_ptr_alloc_obj(reg->type)) {
- verbose(env, "arg#%d is neither owning or non-owning ref\n", i);
+ verbose(env, "%s is neither owning or non-owning ref\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
if (!type_is_non_owning_ref(reg->type))
@@ -12488,7 +12402,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
}
if (rec->refcount_off < 0) {
- verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i);
+ verbose(env, "%s doesn't point to a type with bpf_refcount field\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
@@ -12497,46 +12412,51 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
break;
case KF_ARG_PTR_TO_CONST_STR:
if (reg->type != PTR_TO_MAP_VALUE) {
- verbose(env, "arg#%d doesn't point to a const string\n", i);
+ verbose(env, "%s doesn't point to a const string\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
- ret = check_reg_const_str(env, reg, regno);
+ ret = check_arg_const_str(env, reg, argno);
if (ret)
return ret;
break;
case KF_ARG_PTR_TO_WORKQUEUE:
if (reg->type != PTR_TO_MAP_VALUE) {
- verbose(env, "arg#%d doesn't point to a map value\n", i);
+ verbose(env, "%s doesn't point to a map value\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
- ret = check_map_field_pointer(env, regno, BPF_WORKQUEUE, &meta->map);
+ ret = check_map_field_pointer(env, reg, argno, BPF_WORKQUEUE, &meta->map);
if (ret < 0)
return ret;
break;
case KF_ARG_PTR_TO_TIMER:
if (reg->type != PTR_TO_MAP_VALUE) {
- verbose(env, "arg#%d doesn't point to a map value\n", i);
+ verbose(env, "%s doesn't point to a map value\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
- ret = process_timer_kfunc(env, regno, meta);
+ ret = process_timer_kfunc(env, reg, argno, meta);
if (ret < 0)
return ret;
break;
case KF_ARG_PTR_TO_TASK_WORK:
if (reg->type != PTR_TO_MAP_VALUE) {
- verbose(env, "arg#%d doesn't point to a map value\n", i);
+ verbose(env, "%s doesn't point to a map value\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
- ret = check_map_field_pointer(env, regno, BPF_TASK_WORK, &meta->map);
+ ret = check_map_field_pointer(env, reg, argno, BPF_TASK_WORK, &meta->map);
if (ret < 0)
return ret;
break;
case KF_ARG_PTR_TO_IRQ_FLAG:
if (reg->type != PTR_TO_STACK) {
- verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i);
+ verbose(env, "%s doesn't point to an irq flag on stack\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
- ret = process_irq_flag(env, regno, meta);
+ ret = process_irq_flag(env, reg, argno, meta);
if (ret < 0)
return ret;
break;
@@ -12545,7 +12465,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
int flags = PROCESS_RES_LOCK;
if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
- verbose(env, "arg#%d doesn't point to map value or allocated object\n", i);
+ verbose(env, "%s doesn't point to map value or allocated object\n",
+ reg_arg_name(env, argno));
return -EINVAL;
}
@@ -12557,7 +12478,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] ||
meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore])
flags |= PROCESS_LOCK_IRQ;
- ret = process_spin_lock(env, regno, flags);
+ ret = process_spin_lock(env, reg, argno, flags);
if (ret < 0)
return ret;
break;
@@ -12565,12 +12486,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
}
}
- if (is_kfunc_release(meta) && !meta->release_regno) {
- verbose(env, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
- func_name);
- return -EINVAL;
- }
-
return 0;
}
@@ -12597,6 +12512,10 @@ int bpf_fetch_kfunc_arg_meta(struct bpf_verifier_env *env,
meta->kfunc_flags = *kfunc.flags;
+ /* Only support release referenced argument passed by register */
+ if (is_kfunc_release(meta))
+ meta->release_regno = BPF_REG_1;
+
return 0;
}
@@ -12926,7 +12845,7 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca
}
} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice] ||
meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) {
- enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->initialized_dynptr.type);
+ enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->dynptr.type);
mark_reg_known_zero(env, regs, BPF_REG_0);
@@ -12950,16 +12869,11 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca
}
}
- if (!meta->initialized_dynptr.id) {
+ if (!meta->dynptr.id) {
verifier_bug(env, "no dynptr id");
return -EFAULT;
}
- regs[BPF_REG_0].dynptr_id = meta->initialized_dynptr.id;
-
- /* we don't need to set BPF_REG_0's ref obj id
- * because packet slices are not refcounted (see
- * dynptr_type_refcounted)
- */
+ regs[BPF_REG_0].parent_id = meta->dynptr.id;
} else {
return 0;
}
@@ -12968,14 +12882,12 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca
}
static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name);
-static int process_bpf_exit_full(struct bpf_verifier_env *env,
- bool *do_print_state, bool exception_exit);
static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
bool sleepable, rcu_lock, rcu_unlock, preempt_disable, preempt_enable;
- u32 i, nargs, ptr_type_id, release_ref_obj_id;
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
struct bpf_reg_state *regs = cur_regs(env);
const char *func_name, *ptr_type_name;
const struct btf_type *t, *ptr_type;
@@ -12983,7 +12895,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
struct bpf_insn_aux_data *insn_aux;
int err, insn_idx = *insn_idx_p;
const struct btf_param *args;
+ u32 i, nargs, ptr_type_id;
struct btf *desc_btf;
+ int id;
/* skip for now, but return error when we find this in fixup_kfunc_call */
if (!insn->imm)
@@ -13050,6 +12964,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (err < 0)
return err;
+ if ((is_bpf_obj_drop_kfunc(meta.func_id) ||
+ is_bpf_percpu_obj_drop_kfunc(meta.func_id)) && (is_tracing_prog_type(prog_type) ||
+ /* is_tracing_prog_type() for now doesn't cover non-iterator tracing progs. */
+ (prog_type == BPF_PROG_TYPE_TRACING && env->prog->expected_attach_type != BPF_TRACE_ITER
+ && !env->prog->sleepable))) {
+ struct btf_struct_meta *struct_meta;
+
+ struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id);
+ if (struct_meta && btf_record_has_nmi_unsafe_fields(struct_meta->record)) {
+ verbose(env, "%s cannot be used in tracing programs on types with NMI unsafe fields\n",
+ func_name);
+ return -EINVAL;
+ }
+ }
+
if (is_bpf_rbtree_add_kfunc(meta.func_id)) {
err = push_callback_call(env, insn, insn_idx, meta.subprogno,
set_rbtree_add_callback_state);
@@ -13094,22 +13023,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (rcu_lock) {
env->cur_state->active_rcu_locks++;
} else if (rcu_unlock) {
- struct bpf_func_state *state;
- struct bpf_reg_state *reg;
- u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER);
-
if (env->cur_state->active_rcu_locks == 0) {
verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name);
return -EINVAL;
}
- if (--env->cur_state->active_rcu_locks == 0) {
- bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({
- if (reg->type & MEM_RCU) {
- reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
- reg->type |= PTR_UNTRUSTED;
- }
- }));
- }
+ if (--env->cur_state->active_rcu_locks == 0)
+ invalidate_rcu_protected_refs(env);
} else if (preempt_disable) {
env->cur_state->active_preempt_locks++;
} else if (preempt_enable) {
@@ -13140,37 +13059,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
* PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
*/
if (meta.release_regno) {
- struct bpf_reg_state *reg = &regs[meta.release_regno];
-
- if (meta.initialized_dynptr.ref_obj_id) {
- err = unmark_stack_slots_dynptr(env, reg);
- } else {
- err = release_reference(env, reg->ref_obj_id);
- if (err)
- verbose(env, "kfunc %s#%d reference has not been acquired before\n",
- func_name, meta.func_id);
- }
+ err = release_reg(env, &regs[meta.release_regno], false, !!meta.dynptr.id);
if (err)
return err;
}
if (is_bpf_list_push_kfunc(meta.func_id) || is_bpf_rbtree_add_kfunc(meta.func_id)) {
- release_ref_obj_id = regs[BPF_REG_2].ref_obj_id;
+ id = regs[BPF_REG_2].id;
insn_aux->insert_off = regs[BPF_REG_2].var_off.value;
insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id);
- err = ref_convert_owning_non_owning(env, release_ref_obj_id);
- if (err) {
- verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n",
- func_name, meta.func_id);
- return err;
- }
-
- err = release_reference(env, release_ref_obj_id);
- if (err) {
- verbose(env, "kfunc %s#%d reference has not been acquired before\n",
- func_name, meta.func_id);
- return err;
- }
+ ref_convert_owning_non_owning(env, id);
}
if (meta.func_id == special_kfunc_list[KF_bpf_throw]) {
@@ -13197,6 +13095,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
bpf_mark_reg_not_init(env, &regs[regno]);
regs[regno].subreg_def = DEF_NOT_SUBREG;
}
+ invalidate_outgoing_stack_args(env, cur_func(env));
/* Check return type */
t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL);
@@ -13254,8 +13153,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
regs[BPF_REG_0].type |= MEM_RDONLY;
/* Ensures we don't access the memory after a release_reference() */
- if (meta.ref_obj_id)
- regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
+ if (meta.ref_obj.id) {
+ err = validate_ref_obj(env, &meta.ref_obj);
+ if (err)
+ return err;
+ regs[BPF_REG_0].parent_id = meta.ref_obj.id;
+ }
if (is_kfunc_rcu_protected(&meta))
regs[BPF_REG_0].type |= MEM_RCU;
@@ -13301,13 +13204,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
if (is_kfunc_acquire(&meta)) {
- int id = acquire_reference(env, insn_idx);
-
+ id = acquire_reference(env, insn_idx, 0);
if (id < 0)
return id;
- if (is_kfunc_ret_null(&meta))
- regs[BPF_REG_0].id = id;
- regs[BPF_REG_0].ref_obj_id = id;
+ regs[BPF_REG_0].id = id;
} else if (is_rbtree_node_type(ptr_type) || is_list_node_type(ptr_type)) {
ref_set_non_owning(env, &regs[BPF_REG_0]);
}
@@ -13329,8 +13229,18 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
clear_all_pkt_pointers(env);
nargs = btf_type_vlen(meta.func_proto);
+ if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+ struct bpf_func_state *caller = cur_func(env);
+ struct bpf_subprog_info *caller_info = &env->subprog_info[caller->subprogno];
+ u16 out_stack_arg_cnt = nargs - MAX_BPF_FUNC_REG_ARGS;
+ u16 stack_arg_cnt = bpf_in_stack_arg_cnt(caller_info) + out_stack_arg_cnt;
+
+ if (stack_arg_cnt > caller_info->stack_arg_cnt)
+ caller_info->stack_arg_cnt = stack_arg_cnt;
+ }
+
args = (const struct btf_param *)(meta.func_proto + 1);
- for (i = 0; i < nargs; i++) {
+ for (i = 0; i < min_t(int, nargs, MAX_BPF_FUNC_REG_ARGS); i++) {
u32 regno = i + 1;
t = btf_type_skip_modifiers(desc_btf, args[i].type, NULL);
@@ -13350,7 +13260,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie])
env->prog->call_session_cookie = true;
- if (is_bpf_throw_kfunc(insn))
+ if (bpf_is_throw_kfunc(insn))
return process_bpf_exit_full(env, NULL, true);
return 0;
@@ -13362,7 +13272,7 @@ static bool check_reg_sane_offset_scalar(struct bpf_verifier_env *env,
{
bool known = tnum_is_const(reg->var_off);
s64 val = reg->var_off.value;
- s64 smin = reg->smin_value;
+ s64 smin = reg_smin(reg);
if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
verbose(env, "math between %s pointer and %lld is not allowed\n",
@@ -13391,7 +13301,7 @@ static bool check_reg_sane_offset_ptr(struct bpf_verifier_env *env,
{
bool known = tnum_is_const(reg->var_off);
s64 val = reg->var_off.value;
- s64 smin = reg->smin_value;
+ s64 smin = reg_smin(reg);
if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
verbose(env, "%s pointer offset %lld is not allowed\n",
@@ -13433,7 +13343,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
break;
case PTR_TO_MAP_VALUE:
max = ptr_reg->map_ptr->value_size;
- ptr_limit = mask_to_left ? ptr_reg->smin_value : ptr_reg->umax_value;
+ ptr_limit = mask_to_left ? reg_smin(ptr_reg) : reg_umax(ptr_reg);
break;
default:
return REASON_TYPE;
@@ -13522,7 +13432,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : &info->aux;
struct bpf_verifier_state *vstate = env->cur_state;
bool off_is_imm = tnum_is_const(off_reg->var_off);
- bool off_is_neg = off_reg->smin_value < 0;
+ bool off_is_neg = reg_smin(off_reg) < 0;
bool ptr_is_dst_reg = ptr_reg == dst_reg;
u8 opcode = BPF_OP(insn->code);
u32 alu_state, alu_limit;
@@ -13541,7 +13451,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
if (!commit_window) {
if (!tnum_is_const(off_reg->var_off) &&
- (off_reg->smin_value < 0) != (off_reg->smax_value < 0))
+ (reg_smin(off_reg) < 0) != (reg_smax(off_reg) < 0))
return REASON_BOUNDS;
info->mask_to_left = (opcode == BPF_ADD && off_is_neg) ||
@@ -13597,7 +13507,7 @@ do_sim:
*/
if (!ptr_is_dst_reg) {
tmp = *dst_reg;
- copy_register_state(dst_reg, ptr_reg);
+ *dst_reg = *ptr_reg;
}
err = sanitize_speculative_path(env, NULL, env->insn_idx + 1, env->insn_idx);
if (err < 0)
@@ -13691,7 +13601,7 @@ static int check_stack_access_for_ptr_arithmetic(
static int sanitize_check_bounds(struct bpf_verifier_env *env,
const struct bpf_insn *insn,
- const struct bpf_reg_state *dst_reg)
+ struct bpf_reg_state *dst_reg)
{
u32 dst = insn->dst_reg;
@@ -13708,7 +13618,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env,
return -EACCES;
break;
case PTR_TO_MAP_VALUE:
- if (check_map_access(env, dst, 0, 1, false, ACCESS_HELPER)) {
+ if (check_map_access(env, dst_reg, argno_from_reg(dst), 0, 1, false, ACCESS_HELPER)) {
verbose(env, "R%d pointer arithmetic of map value goes out of range, "
"prohibited for !root\n", dst);
return -EACCES;
@@ -13735,10 +13645,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *regs = state->regs, *dst_reg;
bool known = tnum_is_const(off_reg->var_off);
- s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
- smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
- u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
- umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
+ s64 smin_val = reg_smin(off_reg), smax_val = reg_smax(off_reg);
+ u64 umin_val = reg_umin(off_reg), umax_val = reg_umax(off_reg);
struct bpf_sanitize_info info = {};
u8 opcode = BPF_OP(insn->code);
u32 dst = insn->dst_reg;
@@ -13840,16 +13748,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
* added into the variable offset, and we copy the fixed offset
* from ptr_reg.
*/
- if (check_add_overflow(smin_ptr, smin_val, &dst_reg->smin_value) ||
- check_add_overflow(smax_ptr, smax_val, &dst_reg->smax_value)) {
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- }
- if (check_add_overflow(umin_ptr, umin_val, &dst_reg->umin_value) ||
- check_add_overflow(umax_ptr, umax_val, &dst_reg->umax_value)) {
- dst_reg->umin_value = 0;
- dst_reg->umax_value = U64_MAX;
- }
+ dst_reg->r64 = cnum64_add(ptr_reg->r64, off_reg->r64);
dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
dst_reg->raw = ptr_reg->raw;
if (reg_is_pkt_pointer(ptr_reg)) {
@@ -13881,24 +13780,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst);
return -EACCES;
}
- /* A new variable offset is created. If the subtrahend is known
- * nonnegative, then any reg->range we had before is still good.
- */
- if (check_sub_overflow(smin_ptr, smax_val, &dst_reg->smin_value) ||
- check_sub_overflow(smax_ptr, smin_val, &dst_reg->smax_value)) {
- /* Overflow possible, we know nothing */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- }
- if (umin_ptr < umax_val) {
- /* Overflow possible, we know nothing */
- dst_reg->umin_value = 0;
- dst_reg->umax_value = U64_MAX;
- } else {
- /* Cannot overflow (as long as bounds are consistent) */
- dst_reg->umin_value = umin_ptr - umax_val;
- dst_reg->umax_value = umax_ptr - umin_val;
- }
+ dst_reg->r64 = cnum64_add(ptr_reg->r64, cnum64_negate(off_reg->r64));
dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
dst_reg->raw = ptr_reg->raw;
if (reg_is_pkt_pointer(ptr_reg)) {
@@ -13955,227 +13837,123 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
static void scalar32_min_max_add(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s32 *dst_smin = &dst_reg->s32_min_value;
- s32 *dst_smax = &dst_reg->s32_max_value;
- u32 *dst_umin = &dst_reg->u32_min_value;
- u32 *dst_umax = &dst_reg->u32_max_value;
- u32 umin_val = src_reg->u32_min_value;
- u32 umax_val = src_reg->u32_max_value;
- bool min_overflow, max_overflow;
-
- if (check_add_overflow(*dst_smin, src_reg->s32_min_value, dst_smin) ||
- check_add_overflow(*dst_smax, src_reg->s32_max_value, dst_smax)) {
- *dst_smin = S32_MIN;
- *dst_smax = S32_MAX;
- }
-
- /* If either all additions overflow or no additions overflow, then
- * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax =
- * dst_umax + src_umax. Otherwise (some additions overflow), set
- * the output bounds to unbounded.
- */
- min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin);
- max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax);
-
- if (!min_overflow && max_overflow) {
- *dst_umin = 0;
- *dst_umax = U32_MAX;
- }
+ dst_reg->r32 = cnum32_add(dst_reg->r32, src_reg->r32);
}
static void scalar_min_max_add(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s64 *dst_smin = &dst_reg->smin_value;
- s64 *dst_smax = &dst_reg->smax_value;
- u64 *dst_umin = &dst_reg->umin_value;
- u64 *dst_umax = &dst_reg->umax_value;
- u64 umin_val = src_reg->umin_value;
- u64 umax_val = src_reg->umax_value;
- bool min_overflow, max_overflow;
-
- if (check_add_overflow(*dst_smin, src_reg->smin_value, dst_smin) ||
- check_add_overflow(*dst_smax, src_reg->smax_value, dst_smax)) {
- *dst_smin = S64_MIN;
- *dst_smax = S64_MAX;
- }
-
- /* If either all additions overflow or no additions overflow, then
- * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax =
- * dst_umax + src_umax. Otherwise (some additions overflow), set
- * the output bounds to unbounded.
- */
- min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin);
- max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax);
-
- if (!min_overflow && max_overflow) {
- *dst_umin = 0;
- *dst_umax = U64_MAX;
- }
+ dst_reg->r64 = cnum64_add(dst_reg->r64, src_reg->r64);
}
static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s32 *dst_smin = &dst_reg->s32_min_value;
- s32 *dst_smax = &dst_reg->s32_max_value;
- u32 *dst_umin = &dst_reg->u32_min_value;
- u32 *dst_umax = &dst_reg->u32_max_value;
- u32 umin_val = src_reg->u32_min_value;
- u32 umax_val = src_reg->u32_max_value;
- bool min_underflow, max_underflow;
-
- if (check_sub_overflow(*dst_smin, src_reg->s32_max_value, dst_smin) ||
- check_sub_overflow(*dst_smax, src_reg->s32_min_value, dst_smax)) {
- /* Overflow possible, we know nothing */
- *dst_smin = S32_MIN;
- *dst_smax = S32_MAX;
- }
-
- /* If either all subtractions underflow or no subtractions
- * underflow, it is okay to set: dst_umin = dst_umin - src_umax,
- * dst_umax = dst_umax - src_umin. Otherwise (some subtractions
- * underflow), set the output bounds to unbounded.
- */
- min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin);
- max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax);
-
- if (min_underflow && !max_underflow) {
- *dst_umin = 0;
- *dst_umax = U32_MAX;
- }
+ dst_reg->r32 = cnum32_add(dst_reg->r32, cnum32_negate(src_reg->r32));
}
static void scalar_min_max_sub(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s64 *dst_smin = &dst_reg->smin_value;
- s64 *dst_smax = &dst_reg->smax_value;
- u64 *dst_umin = &dst_reg->umin_value;
- u64 *dst_umax = &dst_reg->umax_value;
- u64 umin_val = src_reg->umin_value;
- u64 umax_val = src_reg->umax_value;
- bool min_underflow, max_underflow;
-
- if (check_sub_overflow(*dst_smin, src_reg->smax_value, dst_smin) ||
- check_sub_overflow(*dst_smax, src_reg->smin_value, dst_smax)) {
- /* Overflow possible, we know nothing */
- *dst_smin = S64_MIN;
- *dst_smax = S64_MAX;
- }
-
- /* If either all subtractions underflow or no subtractions
- * underflow, it is okay to set: dst_umin = dst_umin - src_umax,
- * dst_umax = dst_umax - src_umin. Otherwise (some subtractions
- * underflow), set the output bounds to unbounded.
- */
- min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin);
- max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax);
-
- if (min_underflow && !max_underflow) {
- *dst_umin = 0;
- *dst_umax = U64_MAX;
- }
+ dst_reg->r64 = cnum64_add(dst_reg->r64, cnum64_negate(src_reg->r64));
}
static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s32 *dst_smin = &dst_reg->s32_min_value;
- s32 *dst_smax = &dst_reg->s32_max_value;
- u32 *dst_umin = &dst_reg->u32_min_value;
- u32 *dst_umax = &dst_reg->u32_max_value;
+ s32 smin = reg_s32_min(dst_reg);
+ s32 smax = reg_s32_max(dst_reg);
+ u32 umin = reg_u32_min(dst_reg);
+ u32 umax = reg_u32_max(dst_reg);
s32 tmp_prod[4];
- if (check_mul_overflow(*dst_umax, src_reg->u32_max_value, dst_umax) ||
- check_mul_overflow(*dst_umin, src_reg->u32_min_value, dst_umin)) {
+ if (check_mul_overflow(umax, reg_u32_max(src_reg), &umax) ||
+ check_mul_overflow(umin, reg_u32_min(src_reg), &umin)) {
/* Overflow possible, we know nothing */
- *dst_umin = 0;
- *dst_umax = U32_MAX;
+ umin = 0;
+ umax = U32_MAX;
}
- if (check_mul_overflow(*dst_smin, src_reg->s32_min_value, &tmp_prod[0]) ||
- check_mul_overflow(*dst_smin, src_reg->s32_max_value, &tmp_prod[1]) ||
- check_mul_overflow(*dst_smax, src_reg->s32_min_value, &tmp_prod[2]) ||
- check_mul_overflow(*dst_smax, src_reg->s32_max_value, &tmp_prod[3])) {
+ if (check_mul_overflow(smin, reg_s32_min(src_reg), &tmp_prod[0]) ||
+ check_mul_overflow(smin, reg_s32_max(src_reg), &tmp_prod[1]) ||
+ check_mul_overflow(smax, reg_s32_min(src_reg), &tmp_prod[2]) ||
+ check_mul_overflow(smax, reg_s32_max(src_reg), &tmp_prod[3])) {
/* Overflow possible, we know nothing */
- *dst_smin = S32_MIN;
- *dst_smax = S32_MAX;
+ smin = S32_MIN;
+ smax = S32_MAX;
} else {
- *dst_smin = min_array(tmp_prod, 4);
- *dst_smax = max_array(tmp_prod, 4);
+ smin = min_array(tmp_prod, 4);
+ smax = max_array(tmp_prod, 4);
}
+
+ dst_reg->r32 = cnum32_intersect(cnum32_from_urange(umin, umax),
+ cnum32_from_srange(smin, smax));
}
static void scalar_min_max_mul(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s64 *dst_smin = &dst_reg->smin_value;
- s64 *dst_smax = &dst_reg->smax_value;
- u64 *dst_umin = &dst_reg->umin_value;
- u64 *dst_umax = &dst_reg->umax_value;
+ s64 smin = reg_smin(dst_reg);
+ s64 smax = reg_smax(dst_reg);
+ u64 umin = reg_umin(dst_reg);
+ u64 umax = reg_umax(dst_reg);
s64 tmp_prod[4];
- if (check_mul_overflow(*dst_umax, src_reg->umax_value, dst_umax) ||
- check_mul_overflow(*dst_umin, src_reg->umin_value, dst_umin)) {
+ if (check_mul_overflow(umax, reg_umax(src_reg), &umax) ||
+ check_mul_overflow(umin, reg_umin(src_reg), &umin)) {
/* Overflow possible, we know nothing */
- *dst_umin = 0;
- *dst_umax = U64_MAX;
+ umin = 0;
+ umax = U64_MAX;
}
- if (check_mul_overflow(*dst_smin, src_reg->smin_value, &tmp_prod[0]) ||
- check_mul_overflow(*dst_smin, src_reg->smax_value, &tmp_prod[1]) ||
- check_mul_overflow(*dst_smax, src_reg->smin_value, &tmp_prod[2]) ||
- check_mul_overflow(*dst_smax, src_reg->smax_value, &tmp_prod[3])) {
+ if (check_mul_overflow(smin, reg_smin(src_reg), &tmp_prod[0]) ||
+ check_mul_overflow(smin, reg_smax(src_reg), &tmp_prod[1]) ||
+ check_mul_overflow(smax, reg_smin(src_reg), &tmp_prod[2]) ||
+ check_mul_overflow(smax, reg_smax(src_reg), &tmp_prod[3])) {
/* Overflow possible, we know nothing */
- *dst_smin = S64_MIN;
- *dst_smax = S64_MAX;
+ smin = S64_MIN;
+ smax = S64_MAX;
} else {
- *dst_smin = min_array(tmp_prod, 4);
- *dst_smax = max_array(tmp_prod, 4);
+ smin = min_array(tmp_prod, 4);
+ smax = max_array(tmp_prod, 4);
}
+
+ dst_reg->r64 = cnum64_intersect(cnum64_from_urange(umin, umax),
+ cnum64_from_srange(smin, smax));
}
static void scalar32_min_max_udiv(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- u32 *dst_umin = &dst_reg->u32_min_value;
- u32 *dst_umax = &dst_reg->u32_max_value;
- u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */
+ u32 src_val = reg_u32_min(src_reg); /* non-zero, const divisor */
- *dst_umin = *dst_umin / src_val;
- *dst_umax = *dst_umax / src_val;
+ reg_set_urange32(dst_reg, reg_u32_min(dst_reg) / src_val,
+ reg_u32_max(dst_reg) / src_val);
/* Reset other ranges/tnum to unbounded/unknown. */
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
reset_reg64_and_tnum(dst_reg);
}
static void scalar_min_max_udiv(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- u64 *dst_umin = &dst_reg->umin_value;
- u64 *dst_umax = &dst_reg->umax_value;
- u64 src_val = src_reg->umin_value; /* non-zero, const divisor */
+ u64 src_val = reg_umin(src_reg); /* non-zero, const divisor */
- *dst_umin = div64_u64(*dst_umin, src_val);
- *dst_umax = div64_u64(*dst_umax, src_val);
+ reg_set_urange64(dst_reg, div64_u64(reg_umin(dst_reg), src_val),
+ div64_u64(reg_umax(dst_reg), src_val));
/* Reset other ranges/tnum to unbounded/unknown. */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
reset_reg32_and_tnum(dst_reg);
}
static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s32 *dst_smin = &dst_reg->s32_min_value;
- s32 *dst_smax = &dst_reg->s32_max_value;
- s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */
+ s32 smin = reg_s32_min(dst_reg);
+ s32 smax = reg_s32_max(dst_reg);
+ s32 src_val = reg_s32_min(src_reg); /* non-zero, const divisor */
s32 res1, res2;
/* BPF div specification: S32_MIN / -1 = S32_MIN */
- if (*dst_smin == S32_MIN && src_val == -1) {
+ if (smin == S32_MIN && src_val == -1) {
/*
* If the dividend range contains more than just S32_MIN,
* we cannot precisely track the result, so it becomes unbounded.
@@ -14184,35 +13962,34 @@ static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg,
* = {S32_MIN} U [S32_MAX-9, S32_MAX] = [S32_MIN, S32_MAX]
* Otherwise (if dividend is exactly S32_MIN), result remains S32_MIN.
*/
- if (*dst_smax != S32_MIN) {
- *dst_smin = S32_MIN;
- *dst_smax = S32_MAX;
+ if (smax != S32_MIN) {
+ smin = S32_MIN;
+ smax = S32_MAX;
}
goto reset;
}
- res1 = *dst_smin / src_val;
- res2 = *dst_smax / src_val;
- *dst_smin = min(res1, res2);
- *dst_smax = max(res1, res2);
+ res1 = smin / src_val;
+ res2 = smax / src_val;
+ smin = min(res1, res2);
+ smax = max(res1, res2);
reset:
+ reg_set_srange32(dst_reg, smin, smax);
/* Reset other ranges/tnum to unbounded/unknown. */
- dst_reg->u32_min_value = 0;
- dst_reg->u32_max_value = U32_MAX;
reset_reg64_and_tnum(dst_reg);
}
static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s64 *dst_smin = &dst_reg->smin_value;
- s64 *dst_smax = &dst_reg->smax_value;
- s64 src_val = src_reg->smin_value; /* non-zero, const divisor */
+ s64 smin = reg_smin(dst_reg);
+ s64 smax = reg_smax(dst_reg);
+ s64 src_val = reg_smin(src_reg); /* non-zero, const divisor */
s64 res1, res2;
/* BPF div specification: S64_MIN / -1 = S64_MIN */
- if (*dst_smin == S64_MIN && src_val == -1) {
+ if (smin == S64_MIN && src_val == -1) {
/*
* If the dividend range contains more than just S64_MIN,
* we cannot precisely track the result, so it becomes unbounded.
@@ -14221,79 +13998,66 @@ static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg,
* = {S64_MIN} U [S64_MAX-9, S64_MAX] = [S64_MIN, S64_MAX]
* Otherwise (if dividend is exactly S64_MIN), result remains S64_MIN.
*/
- if (*dst_smax != S64_MIN) {
- *dst_smin = S64_MIN;
- *dst_smax = S64_MAX;
+ if (smax != S64_MIN) {
+ smin = S64_MIN;
+ smax = S64_MAX;
}
goto reset;
}
- res1 = div64_s64(*dst_smin, src_val);
- res2 = div64_s64(*dst_smax, src_val);
- *dst_smin = min(res1, res2);
- *dst_smax = max(res1, res2);
+ res1 = div64_s64(smin, src_val);
+ res2 = div64_s64(smax, src_val);
+ smin = min(res1, res2);
+ smax = max(res1, res2);
reset:
+ reg_set_srange64(dst_reg, smin, smax);
/* Reset other ranges/tnum to unbounded/unknown. */
- dst_reg->umin_value = 0;
- dst_reg->umax_value = U64_MAX;
reset_reg32_and_tnum(dst_reg);
}
static void scalar32_min_max_umod(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- u32 *dst_umin = &dst_reg->u32_min_value;
- u32 *dst_umax = &dst_reg->u32_max_value;
- u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */
+ u32 src_val = reg_u32_min(src_reg); /* non-zero, const divisor */
u32 res_max = src_val - 1;
/*
* If dst_umax <= res_max, the result remains unchanged.
* e.g., [2, 5] % 10 = [2, 5].
*/
- if (*dst_umax <= res_max)
+ if (reg_u32_max(dst_reg) <= res_max)
return;
- *dst_umin = 0;
- *dst_umax = min(*dst_umax, res_max);
+ reg_set_urange32(dst_reg, 0, min(reg_u32_max(dst_reg), res_max));
/* Reset other ranges/tnum to unbounded/unknown. */
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
reset_reg64_and_tnum(dst_reg);
}
static void scalar_min_max_umod(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- u64 *dst_umin = &dst_reg->umin_value;
- u64 *dst_umax = &dst_reg->umax_value;
- u64 src_val = src_reg->umin_value; /* non-zero, const divisor */
+ u64 src_val = reg_umin(src_reg); /* non-zero, const divisor */
u64 res_max = src_val - 1;
/*
* If dst_umax <= res_max, the result remains unchanged.
* e.g., [2, 5] % 10 = [2, 5].
*/
- if (*dst_umax <= res_max)
+ if (reg_umax(dst_reg) <= res_max)
return;
- *dst_umin = 0;
- *dst_umax = min(*dst_umax, res_max);
+ reg_set_urange64(dst_reg, 0, min(reg_umax(dst_reg), res_max));
/* Reset other ranges/tnum to unbounded/unknown. */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
reset_reg32_and_tnum(dst_reg);
}
static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s32 *dst_smin = &dst_reg->s32_min_value;
- s32 *dst_smax = &dst_reg->s32_max_value;
- s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */
+ s32 src_val = reg_s32_min(src_reg); /* non-zero, const divisor */
/*
* Safe absolute value calculation:
@@ -14313,33 +14077,26 @@ static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg,
* If the dividend is already within the result range,
* the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5].
*/
- if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs)
+ if (reg_s32_min(dst_reg) >= -res_max_abs && reg_s32_max(dst_reg) <= res_max_abs)
return;
/* General case: result has the same sign as the dividend. */
- if (*dst_smin >= 0) {
- *dst_smin = 0;
- *dst_smax = min(*dst_smax, res_max_abs);
- } else if (*dst_smax <= 0) {
- *dst_smax = 0;
- *dst_smin = max(*dst_smin, -res_max_abs);
+ if (reg_s32_min(dst_reg) >= 0) {
+ reg_set_srange32(dst_reg, 0, min(reg_s32_max(dst_reg), res_max_abs));
+ } else if (reg_s32_max(dst_reg) <= 0) {
+ reg_set_srange32(dst_reg, max(reg_s32_min(dst_reg), -res_max_abs), 0);
} else {
- *dst_smin = -res_max_abs;
- *dst_smax = res_max_abs;
+ reg_set_srange32(dst_reg, -res_max_abs, res_max_abs);
}
/* Reset other ranges/tnum to unbounded/unknown. */
- dst_reg->u32_min_value = 0;
- dst_reg->u32_max_value = U32_MAX;
reset_reg64_and_tnum(dst_reg);
}
static void scalar_min_max_smod(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s64 *dst_smin = &dst_reg->smin_value;
- s64 *dst_smax = &dst_reg->smax_value;
- s64 src_val = src_reg->smin_value; /* non-zero, const divisor */
+ s64 src_val = reg_smin(src_reg); /* non-zero, const divisor */
/*
* Safe absolute value calculation:
@@ -14359,24 +14116,19 @@ static void scalar_min_max_smod(struct bpf_reg_state *dst_reg,
* If the dividend is already within the result range,
* the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5].
*/
- if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs)
+ if (reg_smin(dst_reg) >= -res_max_abs && reg_smax(dst_reg) <= res_max_abs)
return;
/* General case: result has the same sign as the dividend. */
- if (*dst_smin >= 0) {
- *dst_smin = 0;
- *dst_smax = min(*dst_smax, res_max_abs);
- } else if (*dst_smax <= 0) {
- *dst_smax = 0;
- *dst_smin = max(*dst_smin, -res_max_abs);
+ if (reg_smin(dst_reg) >= 0) {
+ reg_set_srange64(dst_reg, 0, min(reg_smax(dst_reg), res_max_abs));
+ } else if (reg_smax(dst_reg) <= 0) {
+ reg_set_srange64(dst_reg, max(reg_smin(dst_reg), -res_max_abs), 0);
} else {
- *dst_smin = -res_max_abs;
- *dst_smax = res_max_abs;
+ reg_set_srange64(dst_reg, -res_max_abs, res_max_abs);
}
/* Reset other ranges/tnum to unbounded/unknown. */
- dst_reg->umin_value = 0;
- dst_reg->umax_value = U64_MAX;
reset_reg32_and_tnum(dst_reg);
}
@@ -14386,7 +14138,7 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
bool src_known = tnum_subreg_is_const(src_reg->var_off);
bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
- u32 umax_val = src_reg->u32_max_value;
+ u32 umax_val = reg_u32_max(src_reg);
if (src_known && dst_known) {
__mark_reg32_known(dst_reg, var32_off.value);
@@ -14396,19 +14148,9 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
/* We get our minimum from the var_off, since that's inherently
* bitwise. Our maximum is the minimum of the operands' maxima.
*/
- dst_reg->u32_min_value = var32_off.value;
- dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val);
-
- /* Safe to set s32 bounds by casting u32 result into s32 when u32
- * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
- */
- if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
- dst_reg->s32_min_value = dst_reg->u32_min_value;
- dst_reg->s32_max_value = dst_reg->u32_max_value;
- } else {
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
- }
+ reg_set_urange32(dst_reg,
+ var32_off.value,
+ min(reg_u32_max(dst_reg), umax_val));
}
static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
@@ -14416,7 +14158,7 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
{
bool src_known = tnum_is_const(src_reg->var_off);
bool dst_known = tnum_is_const(dst_reg->var_off);
- u64 umax_val = src_reg->umax_value;
+ u64 umax_val = reg_umax(src_reg);
if (src_known && dst_known) {
__mark_reg_known(dst_reg, dst_reg->var_off.value);
@@ -14426,19 +14168,10 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
/* We get our minimum from the var_off, since that's inherently
* bitwise. Our maximum is the minimum of the operands' maxima.
*/
- dst_reg->umin_value = dst_reg->var_off.value;
- dst_reg->umax_value = min(dst_reg->umax_value, umax_val);
+ reg_set_urange64(dst_reg,
+ dst_reg->var_off.value,
+ min(reg_umax(dst_reg), umax_val));
- /* Safe to set s64 bounds by casting u64 result into s64 when u64
- * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
- */
- if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
- dst_reg->smin_value = dst_reg->umin_value;
- dst_reg->smax_value = dst_reg->umax_value;
- } else {
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- }
/* We may learn something more from the var_off */
__update_reg_bounds(dst_reg);
}
@@ -14449,7 +14182,7 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
bool src_known = tnum_subreg_is_const(src_reg->var_off);
bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
- u32 umin_val = src_reg->u32_min_value;
+ u32 umin_val = reg_u32_min(src_reg);
if (src_known && dst_known) {
__mark_reg32_known(dst_reg, var32_off.value);
@@ -14459,19 +14192,9 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
/* We get our maximum from the var_off, and our minimum is the
* maximum of the operands' minima
*/
- dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val);
- dst_reg->u32_max_value = var32_off.value | var32_off.mask;
-
- /* Safe to set s32 bounds by casting u32 result into s32 when u32
- * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
- */
- if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
- dst_reg->s32_min_value = dst_reg->u32_min_value;
- dst_reg->s32_max_value = dst_reg->u32_max_value;
- } else {
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
- }
+ reg_set_urange32(dst_reg,
+ max(reg_u32_min(dst_reg), umin_val),
+ var32_off.value | var32_off.mask);
}
static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
@@ -14479,7 +14202,7 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
{
bool src_known = tnum_is_const(src_reg->var_off);
bool dst_known = tnum_is_const(dst_reg->var_off);
- u64 umin_val = src_reg->umin_value;
+ u64 umin_val = reg_umin(src_reg);
if (src_known && dst_known) {
__mark_reg_known(dst_reg, dst_reg->var_off.value);
@@ -14489,19 +14212,10 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
/* We get our maximum from the var_off, and our minimum is the
* maximum of the operands' minima
*/
- dst_reg->umin_value = max(dst_reg->umin_value, umin_val);
- dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
+ reg_set_urange64(dst_reg,
+ max(reg_umin(dst_reg), umin_val),
+ dst_reg->var_off.value | dst_reg->var_off.mask);
- /* Safe to set s64 bounds by casting u64 result into s64 when u64
- * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
- */
- if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
- dst_reg->smin_value = dst_reg->umin_value;
- dst_reg->smax_value = dst_reg->umax_value;
- } else {
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- }
/* We may learn something more from the var_off */
__update_reg_bounds(dst_reg);
}
@@ -14519,19 +14233,7 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg,
}
/* We get both minimum and maximum from the var32_off. */
- dst_reg->u32_min_value = var32_off.value;
- dst_reg->u32_max_value = var32_off.value | var32_off.mask;
-
- /* Safe to set s32 bounds by casting u32 result into s32 when u32
- * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
- */
- if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
- dst_reg->s32_min_value = dst_reg->u32_min_value;
- dst_reg->s32_max_value = dst_reg->u32_max_value;
- } else {
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
- }
+ reg_set_urange32(dst_reg, var32_off.value, var32_off.value | var32_off.mask);
}
static void scalar_min_max_xor(struct bpf_reg_state *dst_reg,
@@ -14547,46 +14249,30 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg,
}
/* We get both minimum and maximum from the var_off. */
- dst_reg->umin_value = dst_reg->var_off.value;
- dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
-
- /* Safe to set s64 bounds by casting u64 result into s64 when u64
- * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
- */
- if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
- dst_reg->smin_value = dst_reg->umin_value;
- dst_reg->smax_value = dst_reg->umax_value;
- } else {
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- }
-
- __update_reg_bounds(dst_reg);
+ reg_set_urange64(dst_reg,
+ dst_reg->var_off.value,
+ dst_reg->var_off.value | dst_reg->var_off.mask);
}
static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg,
u64 umin_val, u64 umax_val)
{
- /* We lose all sign bit information (except what we can pick
- * up from var_off)
- */
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
/* If we might shift our top bit out, then we know nothing */
- if (umax_val > 31 || dst_reg->u32_max_value > 1ULL << (31 - umax_val)) {
- dst_reg->u32_min_value = 0;
- dst_reg->u32_max_value = U32_MAX;
- } else {
- dst_reg->u32_min_value <<= umin_val;
- dst_reg->u32_max_value <<= umax_val;
- }
+ if (umax_val > 31 || reg_u32_max(dst_reg) > 1ULL << (31 - umax_val))
+ reg_set_urange32(dst_reg, 0, U32_MAX);
+ else
+ /* We lose all sign bit information (except what we can pick
+ * up from var_off)
+ */
+ reg_set_urange32(dst_reg, reg_u32_min(dst_reg) << umin_val,
+ reg_u32_max(dst_reg) << umax_val);
}
static void scalar32_min_max_lsh(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- u32 umax_val = src_reg->u32_max_value;
- u32 umin_val = src_reg->u32_min_value;
+ u32 umax_val = reg_u32_max(src_reg);
+ u32 umin_val = reg_u32_min(src_reg);
/* u32 alu operation will zext upper bits */
struct tnum subreg = tnum_subreg(dst_reg->var_off);
@@ -14603,34 +14289,34 @@ static void scalar32_min_max_lsh(struct bpf_reg_state *dst_reg,
static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg,
u64 umin_val, u64 umax_val)
{
+ struct cnum64 u, s;
+
/* Special case <<32 because it is a common compiler pattern to sign
* extend subreg by doing <<32 s>>32. smin/smax assignments are correct
* because s32 bounds don't flip sign when shifting to the left by
* 32bits.
*/
- if (umin_val == 32 && umax_val == 32) {
- dst_reg->smax_value = (s64)dst_reg->s32_max_value << 32;
- dst_reg->smin_value = (s64)dst_reg->s32_min_value << 32;
- } else {
- dst_reg->smax_value = S64_MAX;
- dst_reg->smin_value = S64_MIN;
- }
+ if (umin_val == 32 && umax_val == 32)
+ s = cnum64_from_srange((s64)reg_s32_min(dst_reg) << 32,
+ (s64)reg_s32_max(dst_reg) << 32);
+ else
+ s = CNUM64_UNBOUNDED;
/* If we might shift our top bit out, then we know nothing */
- if (dst_reg->umax_value > 1ULL << (63 - umax_val)) {
- dst_reg->umin_value = 0;
- dst_reg->umax_value = U64_MAX;
- } else {
- dst_reg->umin_value <<= umin_val;
- dst_reg->umax_value <<= umax_val;
- }
+ if (reg_umax(dst_reg) > 1ULL << (63 - umax_val))
+ u = CNUM64_UNBOUNDED;
+ else
+ u = cnum64_from_urange(reg_umin(dst_reg) << umin_val,
+ reg_umax(dst_reg) << umax_val);
+
+ dst_reg->r64 = cnum64_intersect(u, s);
}
static void scalar_min_max_lsh(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- u64 umax_val = src_reg->umax_value;
- u64 umin_val = src_reg->umin_value;
+ u64 umax_val = reg_umax(src_reg);
+ u64 umin_val = reg_umin(src_reg);
/* scalar64 calc uses 32bit unshifted bounds so must be called first */
__scalar64_min_max_lsh(dst_reg, umin_val, umax_val);
@@ -14645,8 +14331,8 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
struct tnum subreg = tnum_subreg(dst_reg->var_off);
- u32 umax_val = src_reg->u32_max_value;
- u32 umin_val = src_reg->u32_min_value;
+ u32 umax_val = reg_u32_max(src_reg);
+ u32 umin_val = reg_u32_min(src_reg);
/* BPF_RSH is an unsigned shift. If the value in dst_reg might
* be negative, then either:
@@ -14662,12 +14348,10 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg,
* and rely on inferring new ones from the unsigned bounds and
* var_off of the result.
*/
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
dst_reg->var_off = tnum_rshift(subreg, umin_val);
- dst_reg->u32_min_value >>= umax_val;
- dst_reg->u32_max_value >>= umin_val;
+ reg_set_urange32(dst_reg, reg_u32_min(dst_reg) >> umax_val,
+ reg_u32_max(dst_reg) >> umin_val);
__mark_reg64_unbounded(dst_reg);
__update_reg32_bounds(dst_reg);
@@ -14676,8 +14360,8 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg,
static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- u64 umax_val = src_reg->umax_value;
- u64 umin_val = src_reg->umin_value;
+ u64 umax_val = reg_umax(src_reg);
+ u64 umin_val = reg_umin(src_reg);
/* BPF_RSH is an unsigned shift. If the value in dst_reg might
* be negative, then either:
@@ -14693,11 +14377,9 @@ static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg,
* and rely on inferring new ones from the unsigned bounds and
* var_off of the result.
*/
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val);
- dst_reg->umin_value >>= umax_val;
- dst_reg->umax_value >>= umin_val;
+ reg_set_urange64(dst_reg, reg_umin(dst_reg) >> umax_val,
+ reg_umax(dst_reg) >> umin_val);
/* Its not easy to operate on alu32 bounds here because it depends
* on bits being shifted in. Take easy way out and mark unbounded
@@ -14710,22 +14392,19 @@ static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg,
static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- u64 umin_val = src_reg->u32_min_value;
+ u64 umin_val = reg_u32_min(src_reg);
/* Upon reaching here, src_known is true and
* umax_val is equal to umin_val.
+ * Blow away the dst_reg umin_value/umax_value and rely on
+ * dst_reg var_off to refine the result.
*/
- dst_reg->s32_min_value = (u32)(((s32)dst_reg->s32_min_value) >> umin_val);
- dst_reg->s32_max_value = (u32)(((s32)dst_reg->s32_max_value) >> umin_val);
+ reg_set_srange32(dst_reg,
+ (u32)(((s32)reg_s32_min(dst_reg)) >> umin_val),
+ (u32)(((s32)reg_s32_max(dst_reg)) >> umin_val));
dst_reg->var_off = tnum_arshift(tnum_subreg(dst_reg->var_off), umin_val, 32);
- /* blow away the dst_reg umin_value/umax_value and rely on
- * dst_reg var_off to refine the result.
- */
- dst_reg->u32_min_value = 0;
- dst_reg->u32_max_value = U32_MAX;
-
__mark_reg64_unbounded(dst_reg);
__update_reg32_bounds(dst_reg);
}
@@ -14733,22 +14412,16 @@ static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg,
static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- u64 umin_val = src_reg->umin_value;
+ u64 umin_val = reg_umin(src_reg);
/* Upon reaching here, src_known is true and umax_val is equal
* to umin_val.
*/
- dst_reg->smin_value >>= umin_val;
- dst_reg->smax_value >>= umin_val;
+ reg_set_srange64(dst_reg, reg_smin(dst_reg) >> umin_val,
+ reg_smax(dst_reg) >> umin_val);
dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, 64);
- /* blow away the dst_reg umin_value/umax_value and rely on
- * dst_reg var_off to refine the result.
- */
- dst_reg->umin_value = 0;
- dst_reg->umax_value = U64_MAX;
-
/* Its not easy to operate on alu32 bounds here because it depends
* on bits being shifted in from upper 32-bits. Take easy way out
* and mark unbounded so we can recalculate later from tnum.
@@ -14814,13 +14487,13 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn,
if (insn_bitness == 32) {
if (tnum_subreg_is_const(src_reg->var_off)
- && src_reg->s32_min_value == src_reg->s32_max_value
- && src_reg->u32_min_value == src_reg->u32_max_value)
+ && reg_s32_min(src_reg) == reg_s32_max(src_reg)
+ && reg_u32_min(src_reg) == reg_u32_max(src_reg))
src_is_const = true;
} else {
if (tnum_is_const(src_reg->var_off)
- && src_reg->smin_value == src_reg->smax_value
- && src_reg->umin_value == src_reg->umax_value)
+ && reg_smin(src_reg) == reg_smax(src_reg)
+ && reg_umin(src_reg) == reg_umax(src_reg))
src_is_const = true;
}
@@ -14850,7 +14523,7 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn,
case BPF_LSH:
case BPF_RSH:
case BPF_ARSH:
- return (src_is_const && src_reg->umax_value < insn_bitness);
+ return (src_is_const && reg_umax(src_reg) < insn_bitness);
default:
return false;
}
@@ -14863,9 +14536,9 @@ static int maybe_fork_scalars(struct bpf_verifier_env *env, struct bpf_insn *ins
struct bpf_reg_state *regs;
bool alu32;
- if (dst_reg->smin_value == -1 && dst_reg->smax_value == 0)
+ if (reg_smin(dst_reg) == -1 && reg_smax(dst_reg) == 0)
alu32 = false;
- else if (dst_reg->s32_min_value == -1 && dst_reg->s32_max_value == 0)
+ else if (reg_s32_min(dst_reg) == -1 && reg_s32_max(dst_reg) == 0)
alu32 = true;
else
return 0;
@@ -14949,7 +14622,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
break;
case BPF_DIV:
/* BPF div specification: x / 0 = 0 */
- if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) {
+ if ((alu32 && reg_u32_min(&src_reg) == 0) || (!alu32 && reg_umin(&src_reg) == 0)) {
___mark_reg_known(dst_reg, 0);
break;
}
@@ -14966,7 +14639,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
break;
case BPF_MOD:
/* BPF mod specification: x % 0 = x */
- if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0))
+ if ((alu32 && reg_u32_min(&src_reg) == 0) || (!alu32 && reg_umin(&src_reg) == 0))
break;
if (alu32)
if (off == 1)
@@ -15154,7 +14827,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
* umax_value before the ALU operation. After adjust_scalar_min_max_vals(),
* alu32 ops will have zero-extended the result, making umax_value <= U32_MAX.
*/
- u64 dst_umax = dst_reg->umax_value;
+ u64 dst_umax = reg_umax(dst_reg);
err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
if (err)
@@ -15284,7 +14957,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
* copy register state to dest reg
*/
assign_scalar_id_before_mov(env, src_reg);
- copy_register_state(dst_reg, src_reg);
+ *dst_reg = *src_reg;
dst_reg->subreg_def = DEF_NOT_SUBREG;
} else {
/* case: R1 = (s8, s16 s32)R2 */
@@ -15296,10 +14969,10 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else if (src_reg->type == SCALAR_VALUE) {
bool no_sext;
- no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));
+ no_sext = reg_umax(src_reg) < (1ULL << (insn->off - 1));
if (no_sext)
assign_scalar_id_before_mov(env, src_reg);
- copy_register_state(dst_reg, src_reg);
+ *dst_reg = *src_reg;
if (!no_sext)
clear_scalar_id(dst_reg);
coerce_reg_to_size_sx(dst_reg, insn->off >> 3);
@@ -15321,7 +14994,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (is_src_reg_u32)
assign_scalar_id_before_mov(env, src_reg);
- copy_register_state(dst_reg, src_reg);
+ *dst_reg = *src_reg;
/* Make sure ID is cleared if src_reg is not in u32
* range otherwise dst_reg min/max could be incorrectly
* propagated into src_reg by sync_linked_regs()
@@ -15331,11 +15004,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
dst_reg->subreg_def = env->insn_idx + 1;
} else {
/* case: W1 = (s8, s16)W2 */
- bool no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));
+ bool no_sext = reg_umax(src_reg) < (1ULL << (insn->off - 1));
if (no_sext)
assign_scalar_id_before_mov(env, src_reg);
- copy_register_state(dst_reg, src_reg);
+ *dst_reg = *src_reg;
if (!no_sext)
clear_scalar_id(dst_reg);
dst_reg->subreg_def = env->insn_idx + 1;
@@ -15413,17 +15086,17 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
struct bpf_reg_state *reg;
int new_range;
- if (dst_reg->umax_value == 0 && range_right_open)
+ if (reg_umax(dst_reg) == 0 && range_right_open)
/* This doesn't give us any range */
return;
- if (dst_reg->umax_value > MAX_PACKET_OFF)
+ if (reg_umax(dst_reg) > MAX_PACKET_OFF)
/* Risk of overflow. For instance, ptr + (1<<63) may be less
* than pkt_end, but that's because it's also less than pkt.
*/
return;
- new_range = dst_reg->umax_value;
+ new_range = reg_umax(dst_reg);
if (range_right_open)
new_range++;
@@ -15472,7 +15145,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
/* If our ids match, then we must have the same max_value. And we
* don't care about the other reg's fixed offset, since if it's too big
* the range won't allow anything.
- * dst_reg->umax_value is known < MAX_PACKET_OFF, therefore it fits in a u16.
+ * reg_umax(dst_reg) is known < MAX_PACKET_OFF, therefore it fits in a u16.
*/
bpf_for_each_reg_in_vstate(vstate, state, reg, ({
if (reg->type == type && reg->id == dst_reg->id)
@@ -15528,14 +15201,14 @@ static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_s
{
struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off;
struct tnum t2 = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off;
- u64 umin1 = is_jmp32 ? (u64)reg1->u32_min_value : reg1->umin_value;
- u64 umax1 = is_jmp32 ? (u64)reg1->u32_max_value : reg1->umax_value;
- s64 smin1 = is_jmp32 ? (s64)reg1->s32_min_value : reg1->smin_value;
- s64 smax1 = is_jmp32 ? (s64)reg1->s32_max_value : reg1->smax_value;
- u64 umin2 = is_jmp32 ? (u64)reg2->u32_min_value : reg2->umin_value;
- u64 umax2 = is_jmp32 ? (u64)reg2->u32_max_value : reg2->umax_value;
- s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value;
- s64 smax2 = is_jmp32 ? (s64)reg2->s32_max_value : reg2->smax_value;
+ u64 umin1 = is_jmp32 ? (u64)reg_u32_min(reg1) : reg_umin(reg1);
+ u64 umax1 = is_jmp32 ? (u64)reg_u32_max(reg1) : reg_umax(reg1);
+ s64 smin1 = is_jmp32 ? (s64)reg_s32_min(reg1) : reg_smin(reg1);
+ s64 smax1 = is_jmp32 ? (s64)reg_s32_max(reg1) : reg_smax(reg1);
+ u64 umin2 = is_jmp32 ? (u64)reg_u32_min(reg2) : reg_umin(reg2);
+ u64 umax2 = is_jmp32 ? (u64)reg_u32_max(reg2) : reg_umax(reg2);
+ s64 smin2 = is_jmp32 ? (s64)reg_s32_min(reg2) : reg_smin(reg2);
+ s64 smax2 = is_jmp32 ? (s64)reg_s32_max(reg2) : reg_smax(reg2);
if (reg1 == reg2) {
switch (opcode) {
@@ -15580,11 +15253,11 @@ static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_s
* utilize 32-bit subrange knowledge to eliminate
* branches that can't be taken a priori
*/
- if (reg1->u32_min_value > reg2->u32_max_value ||
- reg1->u32_max_value < reg2->u32_min_value)
+ if (reg_u32_min(reg1) > reg_u32_max(reg2) ||
+ reg_u32_max(reg1) < reg_u32_min(reg2))
return 0;
- if (reg1->s32_min_value > reg2->s32_max_value ||
- reg1->s32_max_value < reg2->s32_min_value)
+ if (reg_s32_min(reg1) > reg_s32_max(reg2) ||
+ reg_s32_max(reg1) < reg_s32_min(reg2))
return 0;
}
break;
@@ -15606,11 +15279,11 @@ static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_s
* utilize 32-bit subrange knowledge to eliminate
* branches that can't be taken a priori
*/
- if (reg1->u32_min_value > reg2->u32_max_value ||
- reg1->u32_max_value < reg2->u32_min_value)
+ if (reg_u32_min(reg1) > reg_u32_max(reg2) ||
+ reg_u32_max(reg1) < reg_u32_min(reg2))
return 1;
- if (reg1->s32_min_value > reg2->s32_max_value ||
- reg1->s32_max_value < reg2->s32_min_value)
+ if (reg_s32_min(reg1) > reg_s32_max(reg2) ||
+ reg_s32_max(reg1) < reg_s32_min(reg2))
return 1;
}
break;
@@ -15765,7 +15438,7 @@ static int is_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_state *r
if (!is_reg_const(reg2, is_jmp32))
return -1;
- if (!reg_not_null(reg1))
+ if (!reg_not_null(env, reg1))
return -1;
/* If pointer is valid tests against zero will fail so we can
@@ -15837,27 +15510,15 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state
switch (opcode) {
case BPF_JEQ:
if (is_jmp32) {
- reg1->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value);
- reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value);
- reg1->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value);
- reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value);
- reg2->u32_min_value = reg1->u32_min_value;
- reg2->u32_max_value = reg1->u32_max_value;
- reg2->s32_min_value = reg1->s32_min_value;
- reg2->s32_max_value = reg1->s32_max_value;
+ reg1->r32 = cnum32_intersect(reg1->r32, reg2->r32);
+ reg2->r32 = reg1->r32;
t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off));
reg1->var_off = tnum_with_subreg(reg1->var_off, t);
reg2->var_off = tnum_with_subreg(reg2->var_off, t);
} else {
- reg1->umin_value = max(reg1->umin_value, reg2->umin_value);
- reg1->umax_value = min(reg1->umax_value, reg2->umax_value);
- reg1->smin_value = max(reg1->smin_value, reg2->smin_value);
- reg1->smax_value = min(reg1->smax_value, reg2->smax_value);
- reg2->umin_value = reg1->umin_value;
- reg2->umax_value = reg1->umax_value;
- reg2->smin_value = reg1->smin_value;
- reg2->smax_value = reg1->smax_value;
+ reg1->r64 = cnum64_intersect(reg1->r64, reg2->r64);
+ reg2->r64 = reg1->r64;
reg1->var_off = tnum_intersect(reg1->var_off, reg2->var_off);
reg2->var_off = reg1->var_off;
@@ -15874,32 +15535,11 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state
*/
val = reg_const_value(reg2, is_jmp32);
if (is_jmp32) {
- /* u32_min_value is not equal to 0xffffffff at this point,
- * because otherwise u32_max_value is 0xffffffff as well,
- * in such a case both reg1 and reg2 would be constants,
- * jump would be predicted and regs_refine_cond_op()
- * wouldn't be called.
- *
- * Same reasoning works for all {u,s}{min,max}{32,64} cases
- * below.
- */
- if (reg1->u32_min_value == (u32)val)
- reg1->u32_min_value++;
- if (reg1->u32_max_value == (u32)val)
- reg1->u32_max_value--;
- if (reg1->s32_min_value == (s32)val)
- reg1->s32_min_value++;
- if (reg1->s32_max_value == (s32)val)
- reg1->s32_max_value--;
+ /* Complement of the range [val, val] as cnum32. */
+ cnum32_intersect_with(&reg1->r32, (struct cnum32){ val + 1, U32_MAX - 1 });
} else {
- if (reg1->umin_value == (u64)val)
- reg1->umin_value++;
- if (reg1->umax_value == (u64)val)
- reg1->umax_value--;
- if (reg1->smin_value == (s64)val)
- reg1->smin_value++;
- if (reg1->smax_value == (s64)val)
- reg1->smax_value--;
+ /* Complement of the range [val, val] as cnum64. */
+ cnum64_intersect_with(&reg1->r64, (struct cnum64){ val + 1, U64_MAX - 1 });
}
break;
case BPF_JSET:
@@ -15946,38 +15586,38 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state
break;
case BPF_JLE:
if (is_jmp32) {
- reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value);
- reg2->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value);
+ cnum32_intersect_with_urange(&reg1->r32, 0, reg_u32_max(reg2));
+ cnum32_intersect_with_urange(&reg2->r32, reg_u32_min(reg1), U32_MAX);
} else {
- reg1->umax_value = min(reg1->umax_value, reg2->umax_value);
- reg2->umin_value = max(reg1->umin_value, reg2->umin_value);
+ cnum64_intersect_with_urange(&reg1->r64, 0, reg_umax(reg2));
+ cnum64_intersect_with_urange(&reg2->r64, reg_umin(reg1), U64_MAX);
}
break;
case BPF_JLT:
if (is_jmp32) {
- reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value - 1);
- reg2->u32_min_value = max(reg1->u32_min_value + 1, reg2->u32_min_value);
+ cnum32_intersect_with_urange(&reg1->r32, 0, reg_u32_max(reg2) - 1);
+ cnum32_intersect_with_urange(&reg2->r32, reg_u32_min(reg1) + 1, U32_MAX);
} else {
- reg1->umax_value = min(reg1->umax_value, reg2->umax_value - 1);
- reg2->umin_value = max(reg1->umin_value + 1, reg2->umin_value);
+ cnum64_intersect_with_urange(&reg1->r64, 0, reg_umax(reg2) - 1);
+ cnum64_intersect_with_urange(&reg2->r64, reg_umin(reg1) + 1, U64_MAX);
}
break;
case BPF_JSLE:
if (is_jmp32) {
- reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value);
- reg2->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value);
+ cnum32_intersect_with_srange(&reg1->r32, S32_MIN, reg_s32_max(reg2));
+ cnum32_intersect_with_srange(&reg2->r32, reg_s32_min(reg1), S32_MAX);
} else {
- reg1->smax_value = min(reg1->smax_value, reg2->smax_value);
- reg2->smin_value = max(reg1->smin_value, reg2->smin_value);
+ cnum64_intersect_with_srange(&reg1->r64, S64_MIN, reg_smax(reg2));
+ cnum64_intersect_with_srange(&reg2->r64, reg_smin(reg1), S64_MAX);
}
break;
case BPF_JSLT:
if (is_jmp32) {
- reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value - 1);
- reg2->s32_min_value = max(reg1->s32_min_value + 1, reg2->s32_min_value);
+ cnum32_intersect_with_srange(&reg1->r32, S32_MIN, reg_s32_max(reg2) - 1);
+ cnum32_intersect_with_srange(&reg2->r32, reg_s32_min(reg1) + 1, S32_MAX);
} else {
- reg1->smax_value = min(reg1->smax_value, reg2->smax_value - 1);
- reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value);
+ cnum64_intersect_with_srange(&reg1->r64, S64_MIN, reg_smax(reg2) - 1);
+ cnum64_intersect_with_srange(&reg2->r64, reg_smin(reg1) + 1, S64_MAX);
}
break;
default:
@@ -16015,7 +15655,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
WARN_ON_ONCE(!tnum_equals_const(reg->var_off, 0)))
return;
if (is_null) {
- /* We don't need id and ref_obj_id from this point
+ /* We don't need id from this point
* onwards anymore, thus we should better reset it,
* so that state pruning has chances to take effect.
*/
@@ -16027,15 +15667,10 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
mark_ptr_not_null_reg(reg);
- if (!reg_may_point_to_spin_lock(reg)) {
- /* For not-NULL ptr, reg->ref_obj_id will be reset
- * in release_reference().
- *
- * reg->id is still used by spin_lock ptr. Other
- * than spin_lock ptr type, reg->id can be reset.
- */
- reg->id = 0;
- }
+ /*
+ * reg->id is preserved for object relationship tracking
+ * and spin_lock lock state tracking
+ */
}
}
@@ -16047,10 +15682,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
{
struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *regs = state->regs, *reg;
- u32 ref_obj_id = regs[regno].ref_obj_id;
u32 id = regs[regno].id;
- if (ref_obj_id && ref_obj_id == id && is_null)
+ if (is_null && find_reference_state(vstate, id))
/* regs[regno] is in the " == NULL" branch.
* No one could have freed the reference state before
* doing the NULL check.
@@ -16248,7 +15882,7 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s
reg->delta == known_reg->delta) {
s32 saved_subreg_def = reg->subreg_def;
- copy_register_state(reg, known_reg);
+ *reg = *known_reg;
reg->subreg_def = saved_subreg_def;
} else {
s32 saved_subreg_def = reg->subreg_def;
@@ -16259,7 +15893,7 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s
__mark_reg_known(&fake_reg, (s64)reg->delta - (s64)known_reg->delta);
/* reg = known_reg; reg += delta */
- copy_register_state(reg, known_reg);
+ *reg = *known_reg;
/*
* Must preserve off, id and subreg_def flag,
* otherwise another sync_linked_regs() will be incorrect.
@@ -16356,16 +15990,16 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
if (insn_flags) {
- err = bpf_push_jmp_history(env, this_branch, insn_flags, 0);
+ err = bpf_push_jmp_history(env, this_branch, insn_flags, 0, 0, 0);
if (err)
return err;
}
is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
- copy_register_state(&env->false_reg1, dst_reg);
- copy_register_state(&env->false_reg2, src_reg);
- copy_register_state(&env->true_reg1, dst_reg);
- copy_register_state(&env->true_reg2, src_reg);
+ env->false_reg1 = *dst_reg;
+ env->false_reg2 = *src_reg;
+ env->true_reg1 = *dst_reg;
+ env->true_reg2 = *src_reg;
pred = is_branch_taken(env, dst_reg, src_reg, opcode, is_jmp32);
if (pred >= 0) {
/* If we get here with a dst_reg pointer type it is because
@@ -16420,7 +16054,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (dst_reg->type == SCALAR_VALUE && dst_reg->id)
collect_linked_regs(env, this_branch, dst_reg->id, &linked_regs);
if (linked_regs.cnt > 1) {
- err = bpf_push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
+ err = bpf_push_jmp_history(env, this_branch, 0, 0, 0, linked_regs_pack(&linked_regs));
if (err)
return err;
}
@@ -16434,11 +16068,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (err)
return err;
- copy_register_state(dst_reg, &env->false_reg1);
- copy_register_state(src_reg, &env->false_reg2);
- copy_register_state(&other_branch_regs[insn->dst_reg], &env->true_reg1);
+ *dst_reg = env->false_reg1;
+ *src_reg = env->false_reg2;
+ other_branch_regs[insn->dst_reg] = env->true_reg1;
if (BPF_SRC(insn->code) == BPF_X)
- copy_register_state(&other_branch_regs[insn->src_reg], &env->true_reg2);
+ other_branch_regs[insn->src_reg] = env->true_reg2;
if (BPF_SRC(insn->code) == BPF_X &&
src_reg->type == SCALAR_VALUE && src_reg->id &&
@@ -16773,6 +16407,9 @@ static bool return_retval_range(struct bpf_verifier_env *env, struct bpf_retval_
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
case BPF_TRACE_FSESSION:
+ case BPF_TRACE_FENTRY_MULTI:
+ case BPF_TRACE_FEXIT_MULTI:
+ case BPF_TRACE_FSESSION_MULTI:
*range = retval_range(0, 0);
break;
case BPF_TRACE_RAW_TP:
@@ -16889,8 +16526,8 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
ret_type = btf_type_resolve_ptr(prog->aux->attach_btf,
prog->aux->attach_func_proto->type,
NULL);
- if (ret_type && ret_type == reg_type && reg->ref_obj_id)
- return __check_ptr_off_reg(env, reg, regno, false);
+ if (ret_type && ret_type == reg_type && reg_is_referenced(env, reg))
+ return __check_ptr_off_reg(env, reg, argno_from_reg(regno), false);
}
/* eBPF calling convention is such that R0 is used
@@ -16962,6 +16599,10 @@ static int check_global_subprog_return_code(struct bpf_verifier_env *env)
if (err)
return err;
+ /* Pointers to arena are safe to pass between subprograms. */
+ if (is_arena_reg(env, BPF_REG_0))
+ return 0;
+
if (is_pointer_value(env, BPF_REG_0)) {
verbose(env, "R%d leaks addr as return value\n", BPF_REG_0);
return -EACCES;
@@ -17478,16 +17119,16 @@ static int indirect_jump_min_max_index(struct bpf_verifier_env *env,
u32 *pmin_index, u32 *pmax_index)
{
struct bpf_reg_state *reg = reg_state(env, regno);
- u64 min_index = reg->umin_value;
- u64 max_index = reg->umax_value;
+ u64 min_index = reg_umin(reg);
+ u64 max_index = reg_umax(reg);
const u32 size = 8;
if (min_index > (u64) U32_MAX * size) {
- verbose(env, "the sum of R%u umin_value %llu is too big\n", regno, reg->umin_value);
+ verbose(env, "the sum of R%u umin_value %llu is too big\n", regno, reg_umin(reg));
return -ERANGE;
}
if (max_index > (u64) U32_MAX * size) {
- verbose(env, "the sum of R%u umax_value %llu is too big\n", regno, reg->umax_value);
+ verbose(env, "the sum of R%u umax_value %llu is too big\n", regno, reg_umax(reg));
return -ERANGE;
}
@@ -17586,6 +17227,14 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state)
return check_store_reg(env, insn, false);
case BPF_ST: {
+ /* Handle stack arg write (store immediate) */
+ if (is_stack_arg_st(insn)) {
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+
+ return check_stack_arg_write(env, state, insn->off, NULL);
+ }
+
enum bpf_reg_type dst_reg_type;
err = check_reg_arg(env, insn->dst_reg, SRC_OP);
@@ -17594,7 +17243,7 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state)
dst_reg_type = cur_regs(env)[insn->dst_reg].type;
- err = check_mem_access(env, env->insn_idx, insn->dst_reg,
+ err = check_mem_access(env, env->insn_idx, cur_regs(env) + insn->dst_reg, argno_from_reg(insn->dst_reg),
insn->off, BPF_SIZE(insn->code),
BPF_WRITE, -1, false, false);
if (err)
@@ -17620,6 +17269,8 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state)
}
}
mark_reg_scratched(env, BPF_REG_0);
+ if (bpf_in_stack_arg_cnt(&env->subprog_info[cur_func(env)->subprogno]))
+ cur_func(env)->no_stack_arg_load = true;
if (insn->src_reg == BPF_PSEUDO_CALL)
return check_func_call(env, insn, &env->insn_idx);
if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
@@ -17717,7 +17368,7 @@ static int do_check(struct bpf_verifier_env *env)
}
if (bpf_is_jmp_point(env, env->insn_idx)) {
- err = bpf_push_jmp_history(env, state, 0, 0);
+ err = bpf_push_jmp_history(env, state, 0, 0, 0, 0);
if (err)
return err;
}
@@ -18102,11 +17753,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
if (prog->sleepable)
switch (map->map_type) {
case BPF_MAP_TYPE_HASH:
+ case BPF_MAP_TYPE_RHASH:
case BPF_MAP_TYPE_LRU_HASH:
case BPF_MAP_TYPE_ARRAY:
case BPF_MAP_TYPE_PERCPU_HASH:
case BPF_MAP_TYPE_PERCPU_ARRAY:
case BPF_MAP_TYPE_LRU_PERCPU_HASH:
+ case BPF_MAP_TYPE_LPM_TRIE:
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH_OF_MAPS:
case BPF_MAP_TYPE_RINGBUF:
@@ -18424,11 +18077,12 @@ static int check_and_resolve_insns(struct bpf_verifier_env *env)
return err;
for (i = 0; i < insn_cnt; i++, insn++) {
- if (insn->dst_reg >= MAX_BPF_REG) {
+ if (insn->dst_reg >= MAX_BPF_REG &&
+ !is_stack_arg_st(insn) && !is_stack_arg_stx(insn)) {
verbose(env, "R%d is invalid\n", insn->dst_reg);
return -EINVAL;
}
- if (insn->src_reg >= MAX_BPF_REG) {
+ if (insn->src_reg >= MAX_BPF_REG && !is_stack_arg_ldx(insn)) {
verbose(env, "R%d is invalid\n", insn->src_reg);
return -EINVAL;
}
@@ -18735,7 +18389,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
goto out;
}
}
- for (i = BPF_REG_1; i <= sub->arg_cnt; i++) {
+ for (i = BPF_REG_1; i <= min_t(u32, sub->arg_cnt, MAX_BPF_FUNC_REG_ARGS); i++) {
arg = &sub->args[i - BPF_REG_1];
reg = &regs[i];
@@ -18745,9 +18399,9 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
} else if (arg->arg_type == ARG_ANYTHING) {
reg->type = SCALAR_VALUE;
mark_reg_unknown(env, regs, i);
- } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
+ } else if (arg->arg_type == ARG_PTR_TO_DYNPTR) {
/* assume unspecial LOCAL dynptr type */
- __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen);
+ __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen, 0);
} else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
reg->type = PTR_TO_MEM;
reg->type |= arg->arg_type &
@@ -18773,11 +18427,17 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
mark_reg_unknown(env, regs, i);
} else {
verifier_bug(env, "unhandled arg#%d type %d",
- i - BPF_REG_1, arg->arg_type);
+ i - BPF_REG_1 + 1, arg->arg_type);
ret = -EFAULT;
goto out;
}
}
+ if (env->prog->type == BPF_PROG_TYPE_EXT && sub->arg_cnt > MAX_BPF_FUNC_REG_ARGS) {
+ verbose(env, "freplace programs with >%d args not supported yet\n",
+ MAX_BPF_FUNC_REG_ARGS);
+ ret = -EINVAL;
+ goto out;
+ }
} else {
/* if main BPF program has associated BTF info, validate that
* it's matching expected signature, and otherwise mark BTF
@@ -18785,8 +18445,11 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
*/
if (env->prog->aux->func_info_aux) {
ret = btf_prepare_func_args(env, 0);
- if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX)
+ if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX) {
env->prog->aux->func_info_aux[0].unreliable = true;
+ sub->arg_cnt = 1;
+ sub->stack_arg_cnt = 0;
+ }
}
/* 1st arg to a function */
@@ -18796,9 +18459,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
/* Acquire references for struct_ops program arguments tagged with "__ref" */
if (!subprog && env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) {
- for (i = 0; i < aux->ctx_arg_info_size; i++)
- aux->ctx_arg_info[i].ref_obj_id = aux->ctx_arg_info[i].refcounted ?
- acquire_reference(env, 0) : 0;
+ for (i = 0; i < aux->ctx_arg_info_size; i++) {
+ ret = aux->ctx_arg_info[i].refcounted ? acquire_reference(env, 0, 0) : 0;
+ if (ret < 0)
+ goto out;
+
+ aux->ctx_arg_info[i].ref_id = ret;
+ }
}
ret = do_check(env);
@@ -18834,6 +18501,7 @@ static int do_check_subprogs(struct bpf_verifier_env *env)
struct bpf_prog_aux *aux = env->prog->aux;
struct bpf_func_info_aux *sub_aux;
int i, ret, new_cnt;
+ u32 insn_processed;
if (!aux->func_info)
return 0;
@@ -18848,6 +18516,8 @@ again:
if (!bpf_subprog_is_global(env, i))
continue;
+ insn_processed = env->insn_processed;
+
sub_aux = subprog_aux(env, i);
if (!sub_aux->called || sub_aux->verified)
continue;
@@ -18855,6 +18525,7 @@ again:
env->insn_idx = env->subprog_info[i].start;
WARN_ON_ONCE(env->insn_idx == 0);
ret = do_check_common(env, i);
+ env->subprog_info[i].insn_processed = env->insn_processed - insn_processed;
if (ret) {
return ret;
} else if (env->log.level & BPF_LOG_LEVEL) {
@@ -18881,10 +18552,12 @@ again:
static int do_check_main(struct bpf_verifier_env *env)
{
+ u32 insn_processed = env->insn_processed;
int ret;
env->insn_idx = 0;
ret = do_check_common(env, 0);
+ env->subprog_info[0].insn_processed = env->insn_processed - insn_processed;
if (!ret)
env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
return ret;
@@ -18893,19 +18566,20 @@ static int do_check_main(struct bpf_verifier_env *env)
static void print_verification_stats(struct bpf_verifier_env *env)
{
- int i;
+ /* Skip over hidden subprogs which are not verified. */
+ int i, subprog_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
if (env->log.level & BPF_LOG_STATS) {
verbose(env, "verification time %lld usec\n",
div_u64(env->verification_time, 1000));
- verbose(env, "stack depth ");
- for (i = 0; i < env->subprog_cnt; i++) {
- u32 depth = env->subprog_info[i].stack_depth;
-
- verbose(env, "%d", depth);
- if (i + 1 < env->subprog_cnt)
- verbose(env, "+");
- }
+ verbose(env, "stack depth %d", env->subprog_info[0].stack_depth);
+ for (i = 1; i < subprog_cnt; i++)
+ verbose(env, "+%d", env->subprog_info[i].stack_depth);
+ verbose(env, " max %d\n", env->max_stack_depth);
+ verbose(env, "insns processed %d", env->subprog_info[0].insn_processed);
+ for (i = 1; i < subprog_cnt; i++)
+ if (bpf_subprog_is_global(env, i))
+ verbose(env, "+%d", env->subprog_info[i].insn_processed);
verbose(env, "\n");
}
verbose(env, "processed %d insns (limit %d) max_states_per_insn %d "
@@ -19127,6 +18801,60 @@ static int check_attach_modify_return(unsigned long addr, const char *func_name)
#endif /* CONFIG_FUNCTION_ERROR_INJECTION */
+static bool is_tracing_multi_id(const struct bpf_prog *prog, u32 btf_id)
+{
+ return is_tracing_multi(prog->expected_attach_type) && bpf_multi_func_btf_id[0] == btf_id;
+}
+
+static int btf_id_allow_sleepable(u32 btf_id, unsigned long addr, const struct bpf_prog *prog,
+ const struct btf *btf)
+{
+ const struct btf_type *t;
+ const char *tname;
+
+ switch (prog->type) {
+ case BPF_PROG_TYPE_TRACING:
+ t = btf_type_by_id(btf, btf_id);
+ if (!t)
+ return -EINVAL;
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (!tname)
+ return -EINVAL;
+
+ /*
+ * *.multi sleepable programs will pass initial sleepable check,
+ * the actual attached btf ids are checked later during the link
+ * attachment.
+ */
+ if (is_tracing_multi_id(prog, btf_id))
+ return 0;
+ if (!check_attach_sleepable(btf_id, addr, tname))
+ return 0;
+ /*
+ * fentry/fexit/fmod_ret progs can also be sleepable if they are
+ * in the fmodret id set with the KF_SLEEPABLE flag.
+ */
+ else {
+ u32 *flags = btf_kfunc_is_modify_return(btf, btf_id, prog);
+
+ if (flags && (*flags & KF_SLEEPABLE))
+ return 0;
+ }
+ break;
+ case BPF_PROG_TYPE_LSM:
+ /*
+ * LSM progs check that they are attached to bpf_lsm_*() funcs.
+ * Only some of them are sleepable.
+ */
+ if (bpf_lsm_is_sleepable_hook(btf_id))
+ return 0;
+ break;
+ default:
+ break;
+ }
+ return -EINVAL;
+}
+
int bpf_check_attach_target(struct bpf_verifier_log *log,
const struct bpf_prog *prog,
const struct bpf_prog *tgt_prog,
@@ -19249,7 +18977,10 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
prog_extension &&
(tgt_prog->expected_attach_type == BPF_TRACE_FENTRY ||
tgt_prog->expected_attach_type == BPF_TRACE_FEXIT ||
- tgt_prog->expected_attach_type == BPF_TRACE_FSESSION)) {
+ tgt_prog->expected_attach_type == BPF_TRACE_FENTRY_MULTI ||
+ tgt_prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI ||
+ tgt_prog->expected_attach_type == BPF_TRACE_FSESSION ||
+ tgt_prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI)) {
/* Program extensions can extend all program types
* except fentry/fexit. The reason is the following.
* The fentry/fexit programs are used for performance
@@ -19299,6 +19030,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
btp = bpf_get_raw_tracepoint(tname);
if (!btp)
return -EINVAL;
+ if (prog->sleepable && !tracepoint_is_faultable(btp->tp)) {
+ bpf_log(log, "Sleepable program cannot attach to non-faultable tracepoint %s\n",
+ tname);
+ bpf_put_raw_tracepoint(btp);
+ return -EINVAL;
+ }
fname = kallsyms_lookup((unsigned long)btp->bpf_func, NULL, NULL, NULL,
trace_symbol);
bpf_put_raw_tracepoint(btp);
@@ -19349,7 +19086,11 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
case BPF_TRACE_FSESSION:
- if (prog->expected_attach_type == BPF_TRACE_FSESSION &&
+ case BPF_TRACE_FSESSION_MULTI:
+ case BPF_TRACE_FENTRY_MULTI:
+ case BPF_TRACE_FEXIT_MULTI:
+ if ((prog->expected_attach_type == BPF_TRACE_FSESSION ||
+ prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) &&
!bpf_jit_supports_fsession()) {
bpf_log(log, "JIT does not support fsession\n");
return -EOPNOTSUPP;
@@ -19378,7 +19119,18 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
if (ret < 0)
return ret;
- if (tgt_prog) {
+ /*
+ * *.multi programs don't need an address during program
+ * verification, we just take the module ref if needed.
+ */
+ if (is_tracing_multi_id(prog, btf_id)) {
+ if (btf_is_module(btf)) {
+ mod = btf_try_get_module(btf);
+ if (!mod)
+ return -ENOENT;
+ }
+ addr = 0;
+ } else if (tgt_prog) {
if (subprog == 0)
addr = (long) tgt_prog->bpf_func;
else
@@ -19403,32 +19155,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
}
if (prog->sleepable) {
- ret = -EINVAL;
- switch (prog->type) {
- case BPF_PROG_TYPE_TRACING:
- if (!check_attach_sleepable(btf_id, addr, tname))
- ret = 0;
- /* fentry/fexit/fmod_ret progs can also be sleepable if they are
- * in the fmodret id set with the KF_SLEEPABLE flag.
- */
- else {
- u32 *flags = btf_kfunc_is_modify_return(btf, btf_id,
- prog);
-
- if (flags && (*flags & KF_SLEEPABLE))
- ret = 0;
- }
- break;
- case BPF_PROG_TYPE_LSM:
- /* LSM progs check that they are attached to bpf_lsm_*() funcs.
- * Only some of them are sleepable.
- */
- if (bpf_lsm_is_sleepable_hook(btf_id))
- ret = 0;
- break;
- default:
- break;
- }
+ ret = btf_id_allow_sleepable(btf_id, addr, prog, btf);
if (ret) {
module_put(mod);
bpf_log(log, "%s is not sleepable\n", tname);
@@ -19515,14 +19242,22 @@ static bool can_be_sleepable(struct bpf_prog *prog)
case BPF_MODIFY_RETURN:
case BPF_TRACE_ITER:
case BPF_TRACE_FSESSION:
+ case BPF_TRACE_RAW_TP:
+ case BPF_TRACE_FENTRY_MULTI:
+ case BPF_TRACE_FEXIT_MULTI:
+ case BPF_TRACE_FSESSION_MULTI:
return true;
default:
return false;
}
}
- return prog->type == BPF_PROG_TYPE_LSM ||
- prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ ||
- prog->type == BPF_PROG_TYPE_STRUCT_OPS;
+ if (prog->type == BPF_PROG_TYPE_LSM)
+ return prog->expected_attach_type != BPF_LSM_CGROUP;
+
+ return prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ ||
+ prog->type == BPF_PROG_TYPE_STRUCT_OPS ||
+ prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT ||
+ prog->type == BPF_PROG_TYPE_TRACEPOINT;
}
static int check_attach_btf_id(struct bpf_verifier_env *env)
@@ -19544,7 +19279,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
}
if (prog->sleepable && !can_be_sleepable(prog)) {
- verbose(env, "Only fentry/fexit/fsession/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
+ verbose(env, "Program of this type cannot be sleepable\n");
return -EINVAL;
}
@@ -19597,6 +19332,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
return -EINVAL;
} else if ((prog->expected_attach_type == BPF_TRACE_FEXIT ||
prog->expected_attach_type == BPF_TRACE_FSESSION ||
+ prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI ||
prog->expected_attach_type == BPF_MODIFY_RETURN) &&
btf_id_set_contains(&noreturn_deny, btf_id)) {
verbose(env, "Attaching fexit/fsession/fmod_ret to __noreturn function '%s' is rejected.\n",
@@ -19604,6 +19340,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
return -EINVAL;
}
+ /*
+ * We don't get trampoline for tracing_multi programs at this point,
+ * it's done when tracing_multi link is created.
+ */
+ if (prog->type == BPF_PROG_TYPE_TRACING &&
+ is_tracing_multi(prog->expected_attach_type))
+ return 0;
+
key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id);
tr = bpf_trampoline_get(key, &tgt_info);
if (!tr)
@@ -19616,6 +19360,62 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
return 0;
}
+int bpf_check_attach_btf_id_multi(struct btf *btf, struct bpf_prog *prog, u32 btf_id,
+ struct bpf_attach_target_info *tgt_info)
+{
+ const struct btf_type *t;
+ unsigned long addr;
+ const char *tname;
+ int err;
+
+ if (!btf_id || !btf)
+ return -EINVAL;
+
+ /* Check noreturn attachment. */
+ if ((prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI ||
+ prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) &&
+ btf_id_set_contains(&noreturn_deny, btf_id))
+ return -EINVAL;
+ /* Check denied attachment. */
+ if (btf_id_set_contains(&btf_id_deny, btf_id))
+ return -EINVAL;
+
+ /* Check and get function target data. */
+ t = btf_type_by_id(btf, btf_id);
+ if (!t)
+ return -EINVAL;
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (!tname)
+ return -EINVAL;
+ if (!btf_type_is_func(t))
+ return -EINVAL;
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_func_proto(t))
+ return -EINVAL;
+ err = btf_distill_func_proto(NULL, btf, t, tname, &tgt_info->fmodel);
+ if (err < 0)
+ return err;
+ if (btf_is_module(btf)) {
+ /* The bpf program already holds reference to module. */
+ if (WARN_ON_ONCE(!prog->aux->mod))
+ return -EINVAL;
+ addr = find_kallsyms_symbol_value(prog->aux->mod, tname);
+ } else {
+ addr = kallsyms_lookup_name(tname);
+ }
+ if (!addr || !ftrace_location(addr))
+ return -ENOENT;
+
+ /* Check sleepable program attachment. */
+ if (prog->sleepable) {
+ err = btf_id_allow_sleepable(btf_id, addr, prog, btf);
+ if (err)
+ return err;
+ }
+ tgt_info->tgt_addr = addr;
+ return 0;
+}
+
struct btf *bpf_get_btf_vmlinux(void)
{
if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
@@ -19834,8 +19634,11 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int struct_meta_reg = BPF_REG_3;
int node_offset_reg = BPF_REG_4;
- /* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */
- if (is_bpf_rbtree_add_kfunc(desc->func_id)) {
+ /* list_add/rbtree_add have an extra arg (prev/less),
+ * so args-to-fixup are in diff regs.
+ */
+ if (desc->func_id == special_kfunc_list[KF_bpf_list_add] ||
+ is_bpf_rbtree_add_kfunc(desc->func_id)) {
struct_meta_reg = BPF_REG_4;
node_offset_reg = BPF_REG_5;
}
@@ -19853,7 +19656,9 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
*cnt = 1;
} else if (desc->func_id == special_kfunc_list[KF_bpf_session_is_return] &&
- env->prog->expected_attach_type == BPF_TRACE_FSESSION) {
+ (env->prog->expected_attach_type == BPF_TRACE_FSESSION ||
+ env->prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI)) {
+
/*
* inline the bpf_session_is_return() for fsession:
* bool bpf_session_is_return(void *ctx)
@@ -19866,7 +19671,8 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1);
*cnt = 3;
} else if (desc->func_id == special_kfunc_list[KF_bpf_session_cookie] &&
- env->prog->expected_attach_type == BPF_TRACE_FSESSION) {
+ (env->prog->expected_attach_type == BPF_TRACE_FSESSION ||
+ env->prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI)) {
/*
* inline bpf_session_cookie() for fsession:
* __u64 *bpf_session_cookie(void *ctx)
@@ -19897,12 +19703,12 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return 0;
}
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr,
+ struct bpf_log_attr *attr_log)
{
u64 start_time = ktime_get_ns();
struct bpf_verifier_env *env;
int i, len, ret = -EINVAL, err;
- u32 log_true_size;
bool is_priv;
BTF_TYPE_EMIT(enum bpf_features);
@@ -19949,9 +19755,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
/* user could have requested verbose verifier output
* and supplied buffer to store the verification trace
*/
- ret = bpf_vlog_init(&env->log, attr->log_level,
- (char __user *) (unsigned long) attr->log_buf,
- attr->log_size);
+ ret = bpf_vlog_init(&env->log, attr_log->level, attr_log->ubuf, attr_log->size);
if (ret)
goto err_unlock;
@@ -20113,17 +19917,10 @@ skip_full_check:
env->prog->aux->verified_insns = env->insn_processed;
/* preserve original error even if log finalization is successful */
- err = bpf_vlog_finalize(&env->log, &log_true_size);
+ err = bpf_log_attr_finalize(attr_log, &env->log);
if (err)
ret = err;
- if (uattr_size >= offsetofend(union bpf_attr, log_true_size) &&
- copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size),
- &log_true_size, sizeof(log_true_size))) {
- ret = -EFAULT;
- goto err_release_maps;
- }
-
if (ret)
goto err_release_maps;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 45c0b1ed687a..38f8d9df8fbc 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -197,6 +197,14 @@ static u32 cgrp_dfl_implicit_ss_mask;
/* some controllers can be threaded on the default hierarchy */
static u32 cgrp_dfl_threaded_ss_mask;
+/*
+ * Set across rebind_subsystems() to the controllers leaving a hierarchy.
+ * Guarded by cgroup_mutex. Makes find_existing_css_set() resolve them to the
+ * root css so the affected tasks are migrated there before
+ * cgroup_apply_control_disable() kills the per-cgroup csses.
+ */
+static u32 cgroup_rebind_ss_mask;
+
/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
static int cgroup_root_count;
@@ -264,10 +272,11 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_skip(struct css_task_iter *it,
struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
+static void kill_css_sync(struct cgroup_subsys_state *css);
+static void kill_css_finish(struct cgroup_subsys_state *css);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
-static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
@@ -374,11 +383,6 @@ static void cgroup_idr_remove(struct idr *idr, int id)
spin_unlock_bh(&cgroup_idr_lock);
}
-static bool cgroup_has_tasks(struct cgroup *cgrp)
-{
- return cgrp->nr_populated_csets;
-}
-
static bool cgroup_is_threaded(struct cgroup *cgrp)
{
return cgrp->dom_cgrp != cgrp;
@@ -407,7 +411,7 @@ static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
return false;
/* can only have either domain or threaded children */
- if (cgrp->nr_populated_domain_children)
+ if (READ_ONCE(cgrp->nr_populated_domain_children))
return false;
/* and no domain controllers can be enabled */
@@ -759,52 +763,76 @@ static bool css_set_populated(struct css_set *cset)
}
/**
- * cgroup_update_populated - update the populated count of a cgroup
- * @cgrp: the target cgroup
- * @populated: inc or dec populated count
- *
- * One of the css_sets associated with @cgrp is either getting its first
- * task or losing the last. Update @cgrp->nr_populated_* accordingly. The
- * count is propagated towards root so that a given cgroup's
- * nr_populated_children is zero iff none of its descendants contain any
- * tasks.
- *
- * @cgrp's interface file "cgroup.populated" is zero if both
- * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
- * 1 otherwise. When the sum changes from or to zero, userland is notified
- * that the content of the interface file has changed. This can be used to
- * detect when @cgrp and its descendants become populated or empty.
+ * css_update_populated - update the populated state of a css and ancestors
+ * @css: leaf css whose own populated count is changing
+ * @populated: inc or dec
+ *
+ * One of the css_sets pinned by @css is getting its first task or losing the
+ * last. Propagate the transition up the parent chain so that a css's
+ * nr_populated_children is zero iff none of its descendants contain any tasks.
+ *
+ * For a cgroup->self walk, also runs cgroup-side bookkeeping at each level:
+ * domain/threaded child split, deferred-destroy trigger, and notification via
+ * "cgroup.populated" (zero iff cgrp->self has neither populated csets nor
+ * populated children; userland is notified on transitions).
*/
-static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
+static void css_update_populated(struct cgroup_subsys_state *css, bool populated)
{
- struct cgroup *child = NULL;
+ struct cgroup_subsys_state *child = NULL;
int adj = populated ? 1 : -1;
lockdep_assert_held(&css_set_lock);
do {
- bool was_populated = cgroup_is_populated(cgrp);
+ /* non-NULL only on the cgroup->self walk */
+ struct cgroup *cgrp = css_is_self(css) ? css->cgroup : NULL;
+ bool was_populated = css_is_populated(css);
if (!child) {
- cgrp->nr_populated_csets += adj;
+ WRITE_ONCE(css->nr_populated_csets,
+ css->nr_populated_csets + adj);
} else {
- if (cgroup_is_threaded(child))
- cgrp->nr_populated_threaded_children += adj;
- else
- cgrp->nr_populated_domain_children += adj;
+ WRITE_ONCE(css->nr_populated_children,
+ css->nr_populated_children + adj);
+ if (cgrp) {
+ if (cgroup_is_threaded(child->cgroup))
+ WRITE_ONCE(cgrp->nr_populated_threaded_children,
+ cgrp->nr_populated_threaded_children + adj);
+ else
+ WRITE_ONCE(cgrp->nr_populated_domain_children,
+ cgrp->nr_populated_domain_children + adj);
+ }
}
- if (was_populated == cgroup_is_populated(cgrp))
+ if (was_populated == css_is_populated(css))
break;
- cgroup1_check_for_release(cgrp);
- TRACE_CGROUP_PATH(notify_populated, cgrp,
- cgroup_is_populated(cgrp));
- cgroup_file_notify(&cgrp->events_file);
+ /*
+ * Pair with smp_mb() in kill_css_sync(). Either we observe
+ * CSS_DYING and queue, or the caller observes our decrement
+ * and fires synchronously.
+ */
+ smp_mb();
- child = cgrp;
- cgrp = cgroup_parent(cgrp);
- } while (cgrp);
+ /*
+ * Subtree just emptied below a dying css. Fire deferred kill.
+ * The transition is one-shot for a dying css.
+ */
+ if (was_populated && css_is_dying(css)) {
+ css_get(css);
+ WARN_ON_ONCE(!queue_work(cgroup_offline_wq, &css->kill_finish_work));
+ }
+
+ if (cgrp) {
+ cgroup1_check_for_release(cgrp);
+ TRACE_CGROUP_PATH(notify_populated, cgrp,
+ cgroup_is_populated(cgrp));
+ cgroup_file_notify(&cgrp->events_file);
+ }
+
+ child = css;
+ css = css->parent;
+ } while (css);
}
/**
@@ -812,17 +840,27 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
* @cset: target css_set
* @populated: whether @cset is populated or depopulated
*
- * @cset is either getting the first task or losing the last. Update the
- * populated counters of all associated cgroups accordingly.
+ * @cset is either getting the first task or losing the last. Update the
+ * populated counters along each linked cgroup's self chain and each
+ * subsystem css that @cset pins.
*/
static void css_set_update_populated(struct css_set *cset, bool populated)
{
struct cgrp_cset_link *link;
+ struct cgroup_subsys *ss;
+ int ssid;
lockdep_assert_held(&css_set_lock);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
- cgroup_update_populated(link->cgrp, populated);
+ css_update_populated(&link->cgrp->self, populated);
+
+ for_each_subsys(ss, ssid) {
+ struct cgroup_subsys_state *css = cset->subsys[ssid];
+
+ if (css)
+ css_update_populated(css, populated);
+ }
}
/*
@@ -1053,7 +1091,15 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
* won't change, so no need for locking.
*/
for_each_subsys(ss, i) {
- if (root->subsys_mask & (1UL << i)) {
+ if (unlikely(cgroup_rebind_ss_mask & (1UL << i))) {
+ /*
+ * @ss is leaving this hierarchy and its per-cgroup
+ * csses are about to be killed. Resolve to the
+ * surviving root css so the tasks are migrated there.
+ */
+ template[i] = cgroup_css(&root->cgrp, ss);
+ WARN_ON_ONCE(!template[i]);
+ } else if (root->subsys_mask & (1UL << i)) {
/*
* @ss is in this hierarchy, so we want the
* effective css from @cgrp.
@@ -1823,11 +1869,17 @@ int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask)
struct cgroup *scgrp = &cgrp_dfl_root.cgrp;
/*
- * Controllers from default hierarchy that need to be rebound
- * are all disabled together in one go.
+ * Controllers leaving the default hierarchy are disabled
+ * together. cgroup_rebind_ss_mask makes cgroup_apply_control()
+ * migrate their tasks to the root css, so the per-cgroup csses
+ * are unpopulated when cgroup_finalize_control() kills them.
+ * Clear it before cgroup_finalize_control(), which does no
+ * css_set lookup.
*/
cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask;
+ cgroup_rebind_ss_mask = dfl_disable_ss_mask;
WARN_ON(cgroup_apply_control(scgrp));
+ cgroup_rebind_ss_mask = 0;
cgroup_finalize_control(scgrp, 0);
}
@@ -1841,9 +1893,14 @@ int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask)
WARN_ON(!css || cgroup_css(dcgrp, ss));
if (src_root != &cgrp_dfl_root) {
- /* disable from the source */
+ /*
+ * Disable from the source, migrating its tasks to the
+ * root css first (see cgroup_rebind_ss_mask).
+ */
src_root->subsys_mask &= ~(1 << ssid);
+ cgroup_rebind_ss_mask = 1 << ssid;
WARN_ON(cgroup_apply_control(scgrp));
+ cgroup_rebind_ss_mask = 0;
cgroup_finalize_control(scgrp, 0);
}
@@ -2065,7 +2122,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
#endif
init_waitqueue_head(&cgrp->offline_waitq);
- init_waitqueue_head(&cgrp->dying_populated_waitq);
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
@@ -2170,7 +2226,7 @@ int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask)
hash_for_each(css_set_table, i, cset, hlist) {
link_css_set(&tmp_links, cset, root_cgrp);
if (css_set_populated(cset))
- cgroup_update_populated(root_cgrp, true);
+ css_update_populated(&root_cgrp->self, true);
}
spin_unlock_irq(&css_set_lock);
@@ -3208,7 +3264,7 @@ restart:
struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
DEFINE_WAIT(wait);
- if (!css || !percpu_ref_is_dying(&css->refcnt))
+ if (!css || !css_is_dying(css))
continue;
cgroup_get_live(dsct);
@@ -3375,7 +3431,9 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp)
if (css->parent &&
!(cgroup_ss_mask(dsct) & (1 << ss->id))) {
- kill_css(css);
+ kill_css_sync(css);
+ if (!css_is_populated(css))
+ kill_css_finish(css);
} else if (!css_visible(css)) {
css_clear_dir(css);
if (ss->css_reset)
@@ -3703,7 +3761,7 @@ static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
if (!cgrp)
return -ENOENT;
- cgrp->max_descendants = descendants;
+ WRITE_ONCE(cgrp->max_descendants, descendants);
cgroup_kn_unlock(of->kn);
@@ -3746,7 +3804,7 @@ static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
if (!cgrp)
return -ENOENT;
- cgrp->max_depth = depth;
+ WRITE_ONCE(cgrp->max_depth, depth);
cgroup_kn_unlock(of->kn);
@@ -5067,10 +5125,12 @@ repeat:
task = list_entry(it->task_pos, struct task_struct, cg_list);
/*
- * Hide tasks that are exiting but not yet removed. Keep zombie
- * leaders with live threads visible.
+ * By default, hide tasks that are exiting but not yet removed, while
+ * keeping zombie leaders with live threads visible. Users that need to
+ * walk every existing task can opt out via CSS_TASK_ITER_WITH_DEAD.
*/
- if ((task->flags & PF_EXITING) && !atomic_read(&task->signal->live))
+ if (!(it->flags & CSS_TASK_ITER_WITH_DEAD) &&
+ (task->flags & PF_EXITING) && !atomic_read(&task->signal->live))
goto repeat;
if (it->flags & CSS_TASK_ITER_PROCS) {
@@ -5514,7 +5574,7 @@ static struct cftype cgroup_psi_files[] = {
* css destruction is four-stage process.
*
* 1. Destruction starts. Killing of the percpu_ref is initiated.
- * Implemented in kill_css().
+ * Implemented in kill_css_finish().
*
* 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
* and thus css_tryget_online() is guaranteed to fail, the css can be
@@ -5659,6 +5719,22 @@ static void css_release(struct percpu_ref *ref)
queue_work(cgroup_release_wq, &css->destroy_work);
}
+/*
+ * Deferred kill_css_finish() fired from css_update_populated() once a dying
+ * css's hierarchical populated state drops to zero. Pinned by css_get() at the
+ * queue site; matched by css_put() here.
+ *
+ * @work is the kill_finish_work member embedded in the target css.
+ */
+static void kill_css_finish_work_fn(struct work_struct *work)
+{
+ struct cgroup_subsys_state *css =
+ container_of(work, struct cgroup_subsys_state, kill_finish_work);
+
+ /*
+ * kill_css_finish() asserts that cgroup_mutex is held; cgroup_lock()
+ * is expected to provide it.
+ */
+ cgroup_lock();
+ kill_css_finish(css);
+ cgroup_unlock();
+ /* Drop the reference taken when this work item was queued. */
+ css_put(css);
+}
+
static void init_and_link_css(struct cgroup_subsys_state *css,
struct cgroup_subsys *ss, struct cgroup *cgrp)
{
@@ -5672,6 +5748,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
css->id = -1;
INIT_LIST_HEAD(&css->sibling);
INIT_LIST_HEAD(&css->children);
+ INIT_WORK(&css->kill_finish_work, kill_css_finish_work_fn);
css->serial_nr = css_serial_nr_next++;
atomic_set(&css->online_cnt, 0);
@@ -5993,7 +6070,7 @@ out_unlock:
/*
* This is called when the refcnt of a css is confirmed to be killed.
* css_tryget_online() is now guaranteed to fail. Tell the subsystem to
- * initiate destruction and put the css ref from kill_css().
+ * initiate destruction and put the css ref from kill_css_finish().
*/
static void css_killed_work_fn(struct work_struct *work)
{
@@ -6026,15 +6103,12 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
}
/**
- * kill_css - destroy a css
- * @css: css to destroy
+ * kill_css_sync - synchronous half of css teardown
+ * @css: css being killed
*
- * This function initiates destruction of @css by removing cgroup interface
- * files and putting its base reference. ->css_offline() will be invoked
- * asynchronously once css_tryget_online() is guaranteed to fail and when
- * the reference count reaches zero, @css will be released.
+ * See cgroup_destroy_locked().
*/
-static void kill_css(struct cgroup_subsys_state *css)
+static void kill_css_sync(struct cgroup_subsys_state *css)
{
struct cgroup_subsys *ss = css->ss;
@@ -6052,28 +6126,17 @@ static void kill_css(struct cgroup_subsys_state *css)
css->flags |= CSS_DYING;
/*
- * This must happen before css is disassociated with its cgroup.
- * See seq_css() for details.
+ * Pair with smp_mb() in css_update_populated(). Either our
+ * caller observes the walker's decrement and fires
+ * synchronously, or the walker observes CSS_DYING and queues.
*/
- css_clear_dir(css);
+ smp_mb();
/*
- * Killing would put the base ref, but we need to keep it alive
- * until after ->css_offline().
- */
- css_get(css);
-
- /*
- * cgroup core guarantees that, by the time ->css_offline() is
- * invoked, no new css reference will be given out via
- * css_tryget_online(). We can't simply call percpu_ref_kill() and
- * proceed to offlining css's because percpu_ref_kill() doesn't
- * guarantee that the ref is seen as killed on all CPUs on return.
- *
- * Use percpu_ref_kill_and_confirm() to get notifications as each
- * css is confirmed to be seen as killed on all CPUs.
+ * This must happen before css is disassociated with its cgroup.
+ * See seq_css() for details.
*/
- percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
+ css_clear_dir(css);
css->cgroup->nr_dying_subsys[ss->id]++;
/*
@@ -6087,44 +6150,88 @@ static void kill_css(struct cgroup_subsys_state *css)
}
/**
- * cgroup_destroy_locked - the first stage of cgroup destruction
+ * kill_css_finish - deferred half of css teardown
+ * @css: css being killed
+ *
+ * See cgroup_destroy_locked().
+ */
+static void kill_css_finish(struct cgroup_subsys_state *css)
+{
+ /*
+ * Takes an extra reference on @css and starts killing its percpu_ref;
+ * css_killed_ref_fn is passed as the confirmation callback. Callers
+ * must hold cgroup_mutex.
+ */
+ lockdep_assert_held(&cgroup_mutex);
+
+ /*
+ * Skip on re-entry: cgroup_apply_control_disable() may have killed @css
+ * earlier. cgroup_destroy_locked() can still walk it because
+ * offline_css() (which NULLs cgrp->subsys[ssid]) runs async.
+ */
+ if (percpu_ref_is_dying(&css->refcnt))
+ return;
+
+ /*
+ * Killing would put the base ref, but we need to keep it alive until
+ * after ->css_offline().
+ */
+ css_get(css);
+
+ /*
+ * cgroup core guarantees that, by the time ->css_offline() is invoked,
+ * no new css reference will be given out via css_tryget_online(). We
+ * can't simply call percpu_ref_kill() and proceed to offlining css's
+ * because percpu_ref_kill() doesn't guarantee that the ref is seen as
+ * killed on all CPUs on return.
+ *
+ * Use percpu_ref_kill_and_confirm() to get notifications as each css is
+ * confirmed to be seen as killed on all CPUs.
+ */
+ percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
+}
+
+/**
+ * cgroup_destroy_locked - destroy @cgrp (called on rmdir)
* @cgrp: cgroup to be destroyed
*
- * css's make use of percpu refcnts whose killing latency shouldn't be
- * exposed to userland and are RCU protected. Also, cgroup core needs to
- * guarantee that css_tryget_online() won't succeed by the time
- * ->css_offline() is invoked. To satisfy all the requirements,
- * destruction is implemented in the following two steps.
- *
- * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
- * userland visible parts and start killing the percpu refcnts of
- * css's. Set up so that the next stage will be kicked off once all
- * the percpu refcnts are confirmed to be killed.
- *
- * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
- * rest of destruction. Once all cgroup references are gone, the
- * cgroup is RCU-freed.
- *
- * This function implements s1. After this step, @cgrp is gone as far as
- * the userland is concerned and a new cgroup with the same name may be
- * created. As cgroup doesn't care about the names internally, this
- * doesn't cause any problem.
+ * Tear down @cgrp on behalf of rmdir. Constraints:
+ *
+ * - Userspace: rmdir must succeed when cgroup.procs and friends are empty.
+ *
+ * - Kernel: subsystem ->css_offline() must not run while any task in @cgrp's
+ * subtree is still doing kernel work. A task hidden from cgroup.procs (past
+ * exit_signals() with signal->live cleared) can still schedule, allocate, and
+ * consume resources until its final context switch. Dying descendants in the
+ * subtree can host such tasks too.
+ *
+ * - Kernel: css_tryget_online() must fail by the time ->css_offline() runs.
+ *
+ * The destruction runs in three parts:
+ *
+ * - This function: synchronous user-visible state teardown plus kill_css_sync()
+ * on each subsystem css.
+ *
+ * - For each subsys css: fire kill_css_finish() synchronously if the subtree is
+ * already drained, otherwise rely on css_update_populated() to queue
+ * kill_finish_work when the last populated cset under the css empties.
+ *
+ * - The percpu_ref kill chain: css_killed_ref_fn -> css_killed_work_fn ->
+ * ->css_offline() -> release/free.
+ *
+ * Return 0 on success, -EBUSY if a userspace-visible task or an online child
+ * remains.
*/
static int cgroup_destroy_locked(struct cgroup *cgrp)
- __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
struct cgroup_subsys_state *css;
struct cgrp_cset_link *link;
+ struct css_task_iter it;
+ struct task_struct *task;
int ssid, ret;
lockdep_assert_held(&cgroup_mutex);
- /*
- * Only migration can raise populated from zero and we're already
- * holding cgroup_mutex.
- */
- if (cgroup_is_populated(cgrp))
+ css_task_iter_start(&cgrp->self, 0, &it);
+ task = css_task_iter_next(&it);
+ css_task_iter_end(&it);
+ if (task)
return -EBUSY;
/*
@@ -6148,9 +6255,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
link->cset->dead = true;
spin_unlock_irq(&css_set_lock);
- /* initiate massacre of all css's */
for_each_css(css, ssid, cgrp)
- kill_css(css);
+ kill_css_sync(css);
/* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
css_clear_dir(&cgrp->self);
@@ -6181,81 +6287,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
+ for_each_css(css, ssid, cgrp) {
+ if (!css_is_populated(css))
+ kill_css_finish(css);
+ }
+
return 0;
};
-/**
- * cgroup_drain_dying - wait for dying tasks to leave before rmdir
- * @cgrp: the cgroup being removed
- *
- * cgroup.procs and cgroup.threads use css_task_iter which filters out
- * PF_EXITING tasks so that userspace doesn't see tasks that have already been
- * reaped via waitpid(). However, cgroup_has_tasks() - which tests whether the
- * cgroup has non-empty css_sets - is only updated when dying tasks pass through
- * cgroup_task_dead() in finish_task_switch(). This creates a window where
- * cgroup.procs reads empty but cgroup_has_tasks() is still true, making rmdir
- * fail with -EBUSY from cgroup_destroy_locked() even though userspace sees no
- * tasks.
- *
- * This function aligns cgroup_has_tasks() with what userspace can observe. If
- * cgroup_has_tasks() but the task iterator sees nothing (all remaining tasks are
- * PF_EXITING), we wait for cgroup_task_dead() to finish processing them. As the
- * window between PF_EXITING and cgroup_task_dead() is short, the wait is brief.
- *
- * This function only concerns itself with this cgroup's own dying tasks.
- * Whether the cgroup has children is cgroup_destroy_locked()'s problem.
- *
- * Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we
- * retry the full check from scratch.
- *
- * Must be called with cgroup_mutex held.
- */
-static int cgroup_drain_dying(struct cgroup *cgrp)
- __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
-{
- struct css_task_iter it;
- struct task_struct *task;
- DEFINE_WAIT(wait);
-
- lockdep_assert_held(&cgroup_mutex);
-retry:
- if (!cgroup_has_tasks(cgrp))
- return 0;
-
- /* Same iterator as cgroup.threads - if any task is visible, it's busy */
- css_task_iter_start(&cgrp->self, 0, &it);
- task = css_task_iter_next(&it);
- css_task_iter_end(&it);
-
- if (task)
- return -EBUSY;
-
- /*
- * All remaining tasks are PF_EXITING and will pass through
- * cgroup_task_dead() shortly. Wait for a kick and retry.
- *
- * cgroup_has_tasks() can't transition from false to true while we're
- * holding cgroup_mutex, but the true to false transition happens
- * under css_set_lock (via cgroup_task_dead()). We must retest and
- * prepare_to_wait() under css_set_lock. Otherwise, the transition
- * can happen between our first test and prepare_to_wait(), and we
- * sleep with no one to wake us.
- */
- spin_lock_irq(&css_set_lock);
- if (!cgroup_has_tasks(cgrp)) {
- spin_unlock_irq(&css_set_lock);
- return 0;
- }
- prepare_to_wait(&cgrp->dying_populated_waitq, &wait,
- TASK_UNINTERRUPTIBLE);
- spin_unlock_irq(&css_set_lock);
- mutex_unlock(&cgroup_mutex);
- schedule();
- finish_wait(&cgrp->dying_populated_waitq, &wait);
- mutex_lock(&cgroup_mutex);
- goto retry;
-}
-
int cgroup_rmdir(struct kernfs_node *kn)
{
struct cgroup *cgrp;
@@ -6265,12 +6304,9 @@ int cgroup_rmdir(struct kernfs_node *kn)
if (!cgrp)
return 0;
- ret = cgroup_drain_dying(cgrp);
- if (!ret) {
- ret = cgroup_destroy_locked(cgrp);
- if (!ret)
- TRACE_CGROUP_PATH(rmdir, cgrp);
- }
+ ret = cgroup_destroy_locked(cgrp);
+ if (!ret)
+ TRACE_CGROUP_PATH(rmdir, cgrp);
cgroup_kn_unlock(kn);
return ret;
@@ -7030,7 +7066,6 @@ void cgroup_task_exit(struct task_struct *tsk)
static void do_cgroup_task_dead(struct task_struct *tsk)
{
- struct cgrp_cset_link *link;
struct css_set *cset;
unsigned long flags;
@@ -7044,11 +7079,6 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
list_add_tail(&tsk->cg_list, &cset->dying_tasks);
- /* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */
- list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
- if (waitqueue_active(&link->cgrp->dying_populated_waitq))
- wake_up(&link->cgrp->dying_populated_waitq);
-
if (dl_task(tsk))
dec_dl_tasks_cs(tsk);
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index bb4e692bea30..f7aaf01f7cd5 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -167,6 +167,7 @@ struct cpuset {
*/
int nr_deadline_tasks;
int nr_migrate_dl_tasks;
+ /* DL bandwidth that needs destination reservation for this attach. */
u64 sum_migrate_dl_bw;
/*
* CPU used for temporary DL bandwidth allocation during attach;
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
index 7308e9b02495..3e9968dd91e9 100644
--- a/kernel/cgroup/cpuset-v1.c
+++ b/kernel/cgroup/cpuset-v1.c
@@ -312,7 +312,7 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs,
* This is full cgroup operation which will also call back into
* cpuset. Execute it asynchronously using workqueue.
*/
- if (is_empty && cs->css.cgroup->nr_populated_csets &&
+ if (is_empty && cgroup_has_tasks(cs->css.cgroup) &&
css_tryget_online(&cs->css)) {
struct cpuset_remove_tasks_struct *s;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index e3a081a07c6d..591e3aa487fc 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -432,7 +432,7 @@ static inline bool partition_is_populated(struct cpuset *cs,
* nr_populated_domain_children may include populated
* csets from descendants that are partitions.
*/
- if (cs->css.cgroup->nr_populated_csets ||
+ if (cgroup_has_tasks(cs->css.cgroup) ||
cs->attach_in_progress)
return true;
@@ -1004,8 +1004,11 @@ void rebuild_sched_domains_locked(void)
* prevent the panic.
*/
for (i = 0; doms && i < ndoms; i++) {
- if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask)))
+ if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask))) {
+ free_sched_domains(doms, ndoms);
+ kfree(attr);
return;
+ }
}
/* Have scheduler rebuild the domains */
@@ -1718,7 +1721,8 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
*/
if (is_partition_valid(parent))
adding = cpumask_and(tmp->addmask,
- xcpus, parent->effective_xcpus);
+ cs->effective_xcpus,
+ parent->effective_xcpus);
if (old_prs > 0)
new_prs = -old_prs;
@@ -1810,9 +1814,9 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
* Compute add/delete mask to/from effective_cpus
*
* For valid partition:
- * addmask = exclusive_cpus & ~newmask
+ * addmask = effective_xcpus & ~newmask
* & parent->effective_xcpus
- * delmask = newmask & ~exclusive_cpus
+ * delmask = newmask & ~effective_xcpus
* & parent->effective_xcpus
*
* For invalid partition:
@@ -1824,11 +1828,11 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
deleting = cpumask_and(tmp->delmask,
newmask, parent->effective_xcpus);
} else {
- cpumask_andnot(tmp->addmask, xcpus, newmask);
+ cpumask_andnot(tmp->addmask, cs->effective_xcpus, newmask);
adding = cpumask_and(tmp->addmask, tmp->addmask,
parent->effective_xcpus);
- cpumask_andnot(tmp->delmask, newmask, xcpus);
+ cpumask_andnot(tmp->delmask, newmask, cs->effective_xcpus);
deleting = cpumask_and(tmp->delmask, tmp->delmask,
parent->effective_xcpus);
}
@@ -1867,7 +1871,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
part_error = PERR_NOCPUS;
deleting = false;
adding = cpumask_and(tmp->addmask,
- xcpus, parent->effective_xcpus);
+ cs->effective_xcpus, parent->effective_xcpus);
}
} else {
/*
@@ -1889,7 +1893,8 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
part_error = PERR_NOCPUS;
if (is_partition_valid(cs))
adding = cpumask_and(tmp->addmask,
- xcpus, parent->effective_xcpus);
+ cs->effective_xcpus,
+ parent->effective_xcpus);
} else if (is_partition_invalid(cs) && !cpumask_empty(xcpus) &&
cpumask_subset(xcpus, parent->effective_xcpus)) {
struct cgroup_subsys_state *css;
@@ -2993,7 +2998,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
struct cpuset *cs, *oldcs;
struct task_struct *task;
bool setsched_check;
- int ret;
+ int cpu, ret;
/* used later by cpuset_attach() */
cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
@@ -3038,31 +3043,31 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
}
if (dl_task(task)) {
+ /*
+ * Count all migrating DL tasks for cpuset task accounting.
+ * Only tasks that need a root-domain bandwidth move
+ * contribute to sum_migrate_dl_bw.
+ */
cs->nr_migrate_dl_tasks++;
- cs->sum_migrate_dl_bw += task->dl.dl_bw;
+ if (dl_task_needs_bw_move(task, cs->effective_cpus))
+ cs->sum_migrate_dl_bw += task->dl.dl_bw;
}
}
- if (!cs->nr_migrate_dl_tasks)
+ if (!cs->sum_migrate_dl_bw)
goto out_success;
- if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
- int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
-
- if (unlikely(cpu >= nr_cpu_ids)) {
- reset_migrate_dl_data(cs);
- ret = -EINVAL;
- goto out_unlock;
- }
+ cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
+ if (unlikely(cpu >= nr_cpu_ids)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
- ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
- if (ret) {
- reset_migrate_dl_data(cs);
- goto out_unlock;
- }
+ ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
+ if (ret)
+ goto out_unlock;
- cs->dl_bw_cpu = cpu;
- }
+ cs->dl_bw_cpu = cpu;
out_success:
/*
@@ -3070,7 +3075,10 @@ out_success:
* changes which zero cpus/mems_allowed.
*/
cs->attach_in_progress++;
+
out_unlock:
+ if (ret)
+ reset_migrate_dl_data(cs);
mutex_unlock(&cpuset_mutex);
return ret;
}
@@ -4176,11 +4184,11 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
* node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
* yes. If current has access to memory reserves as an oom victim, yes.
- * Otherwise, no.
+ * If the current task is PF_EXITING, yes. Otherwise, no.
*
* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
* and do not allow allocations outside the current tasks cpuset
- * unless the task has been OOM killed.
+ * unless the task has been OOM killed or is exiting.
* GFP_KERNEL allocations are not so marked, so can escape to the
* nearest enclosing hardwalled ancestor cpuset.
*
@@ -4194,7 +4202,9 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* The first call here from mm/page_alloc:get_page_from_freelist()
* has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
* so no allocation on a node outside the cpuset is allowed (unless
- * in interrupt, of course).
+ * in interrupt, of course). The PF_EXITING check must therefore
+ * come before the __GFP_HARDWALL check, otherwise a dying task
+ * would be blocked on the fast path.
*
* The second pass through get_page_from_freelist() doesn't even call
* here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
@@ -4204,6 +4214,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* in_interrupt - any node ok (current task context irrelevant)
* GFP_ATOMIC - any node ok
* tsk_is_oom_victim - any node ok
+ * PF_EXITING - any node ok (let dying task exit quickly)
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
*/
@@ -4223,10 +4234,12 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
*/
if (unlikely(tsk_is_oom_victim(current)))
return true;
+ if (current->flags & PF_EXITING) /* Let dying task have memory */
+ return true;
if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
return false;
- if (current->flags & PF_EXITING) /* Let dying task have memory */
+ if (cpuset_v2())
return true;
/* Not hardwall and node outside mems_allowed: scan up cpusets */
diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c
index 1ab1fb47f271..4753a67d0f0f 100644
--- a/kernel/cgroup/dmem.c
+++ b/kernel/cgroup/dmem.c
@@ -602,6 +602,7 @@ get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region)
pool = NULL;
continue;
}
+ pool = ERR_PTR(-ENOMEM);
}
}
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index 4fdab4cf49e0..5e82a03b3270 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -9,6 +9,7 @@
*/
#include <linux/bitops.h>
+#include <linux/limits.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/cgroup.h>
@@ -17,6 +18,22 @@
#define RDMACG_MAX_STR "max"
+/*
+ * Token ids returned by match_token() for the rdmacg_limit_tokens[] table
+ * defined right after this enum. Each resource has a numeric (_VAL) and a
+ * literal "max" (_MAX) form. The enum tag and the table share a name; this
+ * is valid C because tags live in their own namespace.
+ */
+enum rdmacg_limit_tokens {
+ RDMACG_HCA_HANDLE_VAL,
+ RDMACG_HCA_HANDLE_MAX,
+ RDMACG_HCA_OBJECT_VAL,
+ RDMACG_HCA_OBJECT_MAX,
+ NR_RDMACG_LIMIT_TOKENS,
+};
+
+/*
+ * Patterns for "<resource>=<value>" limit settings. The "%d" entries carry
+ * an integer argument; the "=max" entries are matched literally. The entry
+ * with a NULL pattern terminates the table.
+ */
+static const match_table_t rdmacg_limit_tokens = {
+ { RDMACG_HCA_HANDLE_VAL, "hca_handle=%d" },
+ { RDMACG_HCA_HANDLE_MAX, "hca_handle=max" },
+ { RDMACG_HCA_OBJECT_VAL, "hca_object=%d" },
+ { RDMACG_HCA_OBJECT_MAX, "hca_object=max" },
+ { NR_RDMACG_LIMIT_TOKENS, NULL },
+};
+
/*
* Protects list of resource pools maintained on per cgroup basis
* and rdma device list.
@@ -27,6 +44,7 @@ static LIST_HEAD(rdmacg_devices);
enum rdmacg_file_type {
RDMACG_RESOURCE_TYPE_MAX,
RDMACG_RESOURCE_TYPE_STAT,
+ RDMACG_RESOURCE_TYPE_PEAK,
};
/*
@@ -43,6 +61,7 @@ static char const *rdmacg_resource_names[] = {
struct rdmacg_resource {
int max;
int usage;
+ int peak;
};
/*
@@ -62,6 +81,12 @@ struct rdmacg_resource_pool {
u64 usage_sum;
/* total number counts which are set to max */
int num_max_cnt;
+
+ /* per-resource event counters */
+ u64 events_max[RDMACG_RESOURCE_MAX];
+ u64 events_alloc_fail[RDMACG_RESOURCE_MAX];
+ u64 events_local_max[RDMACG_RESOURCE_MAX];
+ u64 events_local_alloc_fail[RDMACG_RESOURCE_MAX];
};
static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
@@ -109,6 +134,26 @@ static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
kfree(rpool);
}
+/*
+ * Return true if @rpool holds state that must outlive its last charge:
+ * a non-zero peak or a non-zero event counter for any resource type.
+ */
+static bool rpool_has_persistent_state(struct rdmacg_resource_pool *rpool)
+{
+ int i;
+
+ /*
+ * Keep the rpool alive if any peak value or any event counter
+ * (max, local max, alloc_fail, local alloc_fail) is non-zero,
+ * so that this history persists even after all resources are
+ * freed.
+ */
+ for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+ if (rpool->resources[i].peak ||
+ rpool->events_max[i] ||
+ rpool->events_local_max[i] ||
+ rpool->events_alloc_fail[i] ||
+ rpool->events_local_alloc_fail[i])
+ return true;
+ }
+ return false;
+}
+
static struct rdmacg_resource_pool *
find_cg_rpool_locked(struct rdma_cgroup *cg,
struct rdmacg_device *device)
@@ -187,11 +232,67 @@ uncharge_cg_locked(struct rdma_cgroup *cg,
rpool->usage_sum--;
if (rpool->usage_sum == 0 &&
rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
- /*
- * No user of the rpool and all entries are set to max, so
- * safe to delete this rpool.
- */
- free_cg_rpool_locked(rpool);
+ if (!rpool_has_persistent_state(rpool)) {
+ /*
+ * No user of the rpool and all entries are set to max, so
+ * safe to delete this rpool.
+ */
+ free_cg_rpool_locked(rpool);
+ }
+ }
+}
+
+/**
+ * rdmacg_event_locked - fire event when resource allocation exceeds limit
+ * @cg: requesting cgroup
+ * @over_cg: cgroup whose limit was exceeded
+ * @device: rdma device
+ * @index: resource type index
+ *
+ * Must be called under rdmacg_mutex. Updates four counters for @index and
+ * notifies userspace via cgroup_file_notify() after each update:
+ *
+ * - events_local_alloc_fail in @cg's pool, if @cg has one for @device;
+ * - events_local_max in @over_cg's pool, if @over_cg has one for @device;
+ * - events_max in @over_cg and each of its ancestors;
+ * - events_alloc_fail in @cg and each of its ancestors.
+ *
+ * Both hierarchical walks stop before the cgroup that has no parent, and
+ * silently skip any level for which get_cg_rpool_locked() returns an
+ * ERR_PTR.
+ */
+static void rdmacg_event_locked(struct rdma_cgroup *cg,
+ struct rdma_cgroup *over_cg,
+ struct rdmacg_device *device,
+ enum rdmacg_resource_type index)
+{
+ struct rdmacg_resource_pool *rpool;
+ struct rdma_cgroup *p;
+
+ lockdep_assert_held(&rdmacg_mutex);
+
+ /* Increment local alloc_fail in requesting cgroup */
+ rpool = find_cg_rpool_locked(cg, device);
+ if (rpool) {
+ rpool->events_local_alloc_fail[index]++;
+ cgroup_file_notify(&cg->events_local_file);
+ }
+
+ /* Increment local max in the over-limit cgroup */
+ rpool = find_cg_rpool_locked(over_cg, device);
+ if (rpool) {
+ rpool->events_local_max[index]++;
+ cgroup_file_notify(&over_cg->events_local_file);
+ }
+
+ /* Propagate hierarchical max events upward */
+ for (p = over_cg; parent_rdmacg(p); p = parent_rdmacg(p)) {
+ rpool = get_cg_rpool_locked(p, device);
+ /* Best effort: a level without a usable pool is not counted. */
+ if (!IS_ERR(rpool)) {
+ rpool->events_max[index]++;
+ cgroup_file_notify(&p->events_file);
+ }
+ }
+ /* Propagate hierarchical alloc_fail from requesting cgroup upward */
+ for (p = cg; parent_rdmacg(p); p = parent_rdmacg(p)) {
+ rpool = get_cg_rpool_locked(p, device);
+ /* Best effort: a level without a usable pool is not counted. */
+ if (!IS_ERR(rpool)) {
+ rpool->events_alloc_fail[index]++;
+ cgroup_file_notify(&p->events_file);
+ }
 }
}
@@ -293,12 +394,20 @@ int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
}
}
}
+ /* Update peak only after all charges succeed */
+ for (p = cg; p; p = parent_rdmacg(p)) {
+ rpool = find_cg_rpool_locked(p, device);
+ if (rpool && rpool->resources[index].usage > rpool->resources[index].peak)
+ rpool->resources[index].peak = rpool->resources[index].usage;
+ }
mutex_unlock(&rdmacg_mutex);
*rdmacg = cg;
return 0;
err:
+ if (ret == -EAGAIN)
+ rdmacg_event_locked(cg, p, device, index);
mutex_unlock(&rdmacg_mutex);
rdmacg_uncharge_hierarchy(cg, device, p, index);
return ret;
@@ -355,62 +464,6 @@ void rdmacg_unregister_device(struct rdmacg_device *device)
}
EXPORT_SYMBOL(rdmacg_unregister_device);
-static int parse_resource(char *c, int *intval)
-{
- substring_t argstr;
- char *name, *value = c;
- size_t len;
- int ret, i;
-
- name = strsep(&value, "=");
- if (!name || !value)
- return -EINVAL;
-
- i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name);
- if (i < 0)
- return i;
-
- len = strlen(value);
-
- argstr.from = value;
- argstr.to = value + len;
-
- ret = match_int(&argstr, intval);
- if (ret >= 0) {
- if (*intval < 0)
- return -EINVAL;
- return i;
- }
- if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
- *intval = S32_MAX;
- return i;
- }
- return -EINVAL;
-}
-
-static int rdmacg_parse_limits(char *options,
- int *new_limits, unsigned long *enables)
-{
- char *c;
- int err = -EINVAL;
-
- /* parse resource options */
- while ((c = strsep(&options, " ")) != NULL) {
- int index, intval;
-
- index = parse_resource(c, &intval);
- if (index < 0)
- goto err;
-
- new_limits[index] = intval;
- *enables |= BIT(index);
- }
- return 0;
-
-err:
- return err;
-}
-
static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
{
struct rdmacg_device *device;
@@ -432,6 +485,7 @@ static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
struct rdmacg_resource_pool *rpool;
struct rdmacg_device *device;
char *options = strstrip(buf);
+ char *p;
int *new_limits;
unsigned long enables = 0;
int i = 0, ret = 0;
@@ -449,9 +503,45 @@ static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
goto err;
}
- ret = rdmacg_parse_limits(options, new_limits, &enables);
- if (ret)
- goto parse_err;
+ /* parse resource limit tokens */
+ while ((p = strsep(&options, " \t\n"))) {
+ substring_t args[MAX_OPT_ARGS];
+ int tok, intval;
+
+ if (!*p)
+ continue;
+
+ tok = match_token(p, rdmacg_limit_tokens, args);
+ switch (tok) {
+ case RDMACG_HCA_HANDLE_VAL:
+ if (match_int(&args[0], &intval) || intval < 0) {
+ ret = -EINVAL;
+ goto parse_err;
+ }
+ new_limits[RDMACG_RESOURCE_HCA_HANDLE] = intval;
+ enables |= BIT(RDMACG_RESOURCE_HCA_HANDLE);
+ break;
+ case RDMACG_HCA_HANDLE_MAX:
+ new_limits[RDMACG_RESOURCE_HCA_HANDLE] = S32_MAX;
+ enables |= BIT(RDMACG_RESOURCE_HCA_HANDLE);
+ break;
+ case RDMACG_HCA_OBJECT_VAL:
+ if (match_int(&args[0], &intval) || intval < 0) {
+ ret = -EINVAL;
+ goto parse_err;
+ }
+ new_limits[RDMACG_RESOURCE_HCA_OBJECT] = intval;
+ enables |= BIT(RDMACG_RESOURCE_HCA_OBJECT);
+ break;
+ case RDMACG_HCA_OBJECT_MAX:
+ new_limits[RDMACG_RESOURCE_HCA_OBJECT] = S32_MAX;
+ enables |= BIT(RDMACG_RESOURCE_HCA_OBJECT);
+ break;
+ default:
+ ret = -EINVAL;
+ goto parse_err;
+ }
+ }
/* acquire lock to synchronize with hot plug devices */
mutex_lock(&rdmacg_mutex);
@@ -474,11 +564,13 @@ static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
if (rpool->usage_sum == 0 &&
rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
- /*
- * No user of the rpool and all entries are set to max, so
- * safe to delete this rpool.
- */
- free_cg_rpool_locked(rpool);
+ if (!rpool_has_persistent_state(rpool)) {
+ /*
+ * No user of the rpool and all entries are set to max, so
+ * safe to delete this rpool.
+ */
+ free_cg_rpool_locked(rpool);
+ }
}
dev_err:
@@ -508,6 +600,8 @@ static void print_rpool_values(struct seq_file *sf,
value = rpool->resources[i].max;
else
value = S32_MAX;
+ } else if (sf_type == RDMACG_RESOURCE_TYPE_PEAK) {
+ value = rpool ? rpool->resources[i].peak : 0;
} else {
if (rpool)
value = rpool->resources[i].usage;
@@ -544,6 +638,64 @@ static int rdmacg_resource_read(struct seq_file *sf, void *v)
return 0;
}
+/*
+ * Print one line per registered rdma device with the per-resource event
+ * counters of the cgroup behind @sf:
+ *
+ *   <device> <res>.max=<n> <res>.alloc_fail=<n> [<res>.max=<n> ...]
+ *
+ * @local selects the local counters (events_local_*) instead of the
+ * hierarchical ones (events_*). A cgroup without a resource pool for a
+ * device reports zeroes. Takes rdmacg_mutex for the whole walk.
+ */
+static void rdmacg_events_print(struct seq_file *sf, bool local)
+{
+	struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
+	struct rdmacg_resource_pool *rpool;
+	struct rdmacg_device *device;
+	int i;
+
+	mutex_lock(&rdmacg_mutex);
+
+	list_for_each_entry(device, &rdmacg_devices, dev_node) {
+		const u64 *max = NULL;
+		const u64 *alloc_fail = NULL;
+
+		rpool = find_cg_rpool_locked(cg, device);
+		if (rpool) {
+			max = local ? rpool->events_local_max :
+				      rpool->events_max;
+			alloc_fail = local ? rpool->events_local_alloc_fail :
+					     rpool->events_alloc_fail;
+		}
+
+		seq_printf(sf, "%s ", device->name);
+		for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+			seq_printf(sf, "%s.max=%llu %s.alloc_fail=%llu",
+				   rdmacg_resource_names[i],
+				   max ? max[i] : 0ULL,
+				   rdmacg_resource_names[i],
+				   alloc_fail ? alloc_fail[i] : 0ULL);
+			/* separate resources with a space, none after the last */
+			if (i < RDMACG_RESOURCE_MAX - 1)
+				seq_putc(sf, ' ');
+		}
+		seq_putc(sf, '\n');
+	}
+
+	mutex_unlock(&rdmacg_mutex);
+}
+
+/* seq_show handler for the "events" file: hierarchical counters. */
+static int rdmacg_events_show(struct seq_file *sf, void *v)
+{
+	rdmacg_events_print(sf, false);
+	return 0;
+}
+
+/* seq_show handler for the "events.local" file: local counters. */
+static int rdmacg_events_local_show(struct seq_file *sf, void *v)
+{
+	rdmacg_events_print(sf, true);
+	return 0;
+}
+
static struct cftype rdmacg_files[] = {
{
.name = "max",
@@ -558,6 +710,24 @@ static struct cftype rdmacg_files[] = {
.private = RDMACG_RESOURCE_TYPE_STAT,
.flags = CFTYPE_NOT_ON_ROOT,
},
+ {
+ .name = "peak",
+ .seq_show = rdmacg_resource_read,
+ .private = RDMACG_RESOURCE_TYPE_PEAK,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "events",
+ .seq_show = rdmacg_events_show,
+ .file_offset = offsetof(struct rdma_cgroup, events_file),
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "events.local",
+ .seq_show = rdmacg_events_local_show,
+ .file_offset = offsetof(struct rdma_cgroup, events_local_file),
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
{ } /* terminate */
};
@@ -577,6 +747,13 @@ rdmacg_css_alloc(struct cgroup_subsys_state *parent)
static void rdmacg_css_free(struct cgroup_subsys_state *css)
{
struct rdma_cgroup *cg = css_rdmacg(css);
+ struct rdmacg_resource_pool *rpool, *tmp;
+
+ /*
+ * Clean up rpools kept alive by non-zero peak values: every pool
+ * still linked on cg->rpools is freed under rdmacg_mutex before the
+ * cgroup itself is released with kfree(). The _safe iterator is
+ * used because each entry is freed while the list is being walked.
+ */
+ mutex_lock(&rdmacg_mutex);
+ list_for_each_entry_safe(rpool, tmp, &cg->rpools, cg_node)
+ free_cg_rpool_locked(rpool);
+ mutex_unlock(&rdmacg_mutex);
kfree(cg);
}
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 150e5871e66f..de816a43db9f 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"
+#include <linux/cpumask.h>
#include <linux/sched/cputime.h>
#include <linux/bpf.h>
@@ -53,7 +54,7 @@ static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
}
/**
- * css_rstat_updated - keep track of updated rstat_cpu
+ * __css_rstat_updated - keep track of updated rstat_cpu
* @css: target cgroup subsystem state
* @cpu: cpu on which rstat_cpu was updated
*
@@ -63,31 +64,27 @@ static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
*
* NOTE: if the user needs the guarantee that the updater either adds itself to
* the lockless list or the concurrent flusher flushes its updated stats, a
- * memory barrier is needed before the call to css_rstat_updated() i.e. a
+ * memory barrier is needed before the call to __css_rstat_updated() i.e. a
* barrier after updating the per-cpu stats and before calling
- * css_rstat_updated().
+ * __css_rstat_updated().
*/
-__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
+void __css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
{
struct llist_head *lhead;
struct css_rstat_cpu *rstatc;
struct llist_node *self;
- /*
- * Since bpf programs can call this function, prevent access to
- * uninitialized rstat pointers.
- */
+ /* Prevent access to uninitialized rstat pointers. */
if (!css_uses_rstat(css))
return;
lockdep_assert_preemption_disabled();
/*
- * For archs withnot nmi safe cmpxchg or percpu ops support, ignore
- * the requests from nmi context.
+ * The lockless insertion below relies on NMI-safe cmpxchg;
+ * bail out in NMI on archs that don't provide it.
*/
- if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
- !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi())
+ if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) && in_nmi())
return;
rstatc = css_rstat_cpu(css, cpu);
@@ -125,6 +122,18 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
llist_add(&rstatc->lnode, lhead);
}
+/*
+ * BPF-facing wrapper for __css_rstat_updated(). Validate the caller-provided
+ * CPU before passing it to the internal rstat updater.
+ *
+ * @css: target cgroup subsystem state, forwarded unchanged
+ * @cpu: CPU number supplied by the caller
+ *
+ * The call is silently dropped when @cpu is negative, is not below
+ * nr_cpu_ids, or is not a possible CPU; otherwise it is forwarded to
+ * __css_rstat_updated() as is.
+ */
+__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
+{
+ if (unlikely(cpu < 0 || cpu >= nr_cpu_ids || !cpu_possible(cpu)))
+ return;
+
+ __css_rstat_updated(css, cpu);
+}
+
static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu)
{
/* put @css and all ancestors on the corresponding updated lists */
@@ -170,7 +179,7 @@ static void css_process_update_tree(struct cgroup_subsys *ss, int cpu)
* flusher flush the stats updated by the updater who have
* observed that they are already on the list. The
* corresponding barrier pair for this one should be before
- * css_rstat_updated() by the user.
+ * __css_rstat_updated() by the user.
*
* For now, there aren't any such users, so not adding the
* barrier here but if such a use-case arises, please add
@@ -614,7 +623,7 @@ static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
unsigned long flags)
{
u64_stats_update_end_irqrestore(&rstatbc->bsync, flags);
- css_rstat_updated(&cgrp->self, smp_processor_id());
+ __css_rstat_updated(&cgrp->self, smp_processor_id());
put_cpu_ptr(rstatbc);
}
diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config
index 7c3924614e01..26831a2a5739 100644
--- a/kernel/configs/hardening.config
+++ b/kernel/configs/hardening.config
@@ -22,7 +22,7 @@ CONFIG_SLAB_FREELIST_RANDOM=y
CONFIG_SLAB_FREELIST_HARDENED=y
CONFIG_SLAB_BUCKETS=y
CONFIG_SHUFFLE_PAGE_ALLOCATOR=y
-CONFIG_RANDOM_KMALLOC_CACHES=y
+CONFIG_KMALLOC_PARTITION_CACHES=y
# Sanity check userspace page table mappings.
CONFIG_PAGE_TABLE_CHECK=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index bc4f7a9ba64e..f975bb34915b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2639,7 +2639,7 @@ static void cpuhp_offline_cpu_device(unsigned int cpu)
{
struct device *dev = get_cpu_device(cpu);
- dev->offline = true;
+ dev_set_offline(dev);
/* Tell user space about the state change */
kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
}
@@ -2648,7 +2648,7 @@ static void cpuhp_online_cpu_device(unsigned int cpu)
{
struct device *dev = get_cpu_device(cpu);
- dev->offline = false;
+ dev_clear_offline(dev);
/* Tell user space about the state change */
kobject_uevent(&dev->kobj, KOBJ_ONLINE);
}
diff --git a/kernel/cred.c b/kernel/cred.c
index 12a7b1ce5131..3df4e15bd67f 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -384,8 +384,9 @@ int commit_creds(struct cred *new)
!uid_eq(old->fsuid, new->fsuid) ||
!gid_eq(old->fsgid, new->fsgid) ||
!cred_cap_issubset(old, new)) {
+ /* mm-less tasks share init_task's exec_state */
if (task->mm)
- set_dumpable(task->mm, suid_dumpable);
+ task_exec_state_set_dumpable(suid_dumpable);
task->pdeath_signal = 0;
/*
* If a task drops privileges and becomes nondumpable,
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 1a725edbbbf6..2c0e2cd89b5e 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -1251,7 +1251,14 @@ void debug_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
entry->direction = direction;
entry->map_err_type = MAP_ERR_NOT_CHECKED;
- if (!(attrs & DMA_ATTR_MMIO)) {
+ if (attrs & DMA_ATTR_MMIO) {
+ unsigned long pfn = PHYS_PFN(phys);
+
+ if (pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
+ err_printk(dev, entry,
+ "dma_map_resource called for RAM address %pa\n",
+ &phys);
+ } else {
check_for_stack(dev, phys);
if (!PhysHighMem(phys))
@@ -1549,7 +1556,7 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
struct dma_debug_entry ref = {
.type = dma_debug_sg,
.dev = dev,
- .paddr = sg_phys(sg),
+ .paddr = sg_phys(s),
.dev_addr = sg_dma_address(s),
.size = sg_dma_len(s),
.direction = direction,
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index ec887f443741..4391b797d4db 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -39,7 +39,7 @@ static inline struct page *dma_direct_to_page(struct device *dev,
u64 dma_direct_get_required_mask(struct device *dev)
{
- phys_addr_t phys = (phys_addr_t)(max_pfn - 1) << PAGE_SHIFT;
+ phys_addr_t phys = ((phys_addr_t)max_pfn << PAGE_SHIFT) - 1;
u64 max_dma = phys_to_dma_direct(dev, phys);
return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
@@ -476,7 +476,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
* must be mapped with CPU physical address and not PCI
* bus addresses.
*/
- break;
+ fallthrough;
case PCI_P2PDMA_MAP_NONE:
need_sync = true;
sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg),
@@ -553,7 +553,7 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
int dma_direct_supported(struct device *dev, u64 mask)
{
- u64 min_mask = (max_pfn - 1) << PAGE_SHIFT;
+ u64 min_mask = ((u64)max_pfn << PAGE_SHIFT) - 1;
/*
* Because 32-bit DMA masks are so common we expect every architecture
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 23ed8eb9233e..4eedb1a6273a 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -126,11 +126,9 @@ static bool dma_go_direct(struct device *dev, dma_addr_t mask,
if (likely(!ops))
return true;
-#ifdef CONFIG_DMA_OPS_BYPASS
- if (dev->dma_ops_bypass)
+ if (IS_ENABLED(CONFIG_DMA_OPS_BYPASS) && dev_dma_ops_bypass(dev))
return min_not_zero(mask, dev->bus_dma_limit) >=
dma_direct_get_required_mask(dev);
-#endif
return false;
}
@@ -365,10 +363,6 @@ EXPORT_SYMBOL(dma_unmap_sg_attrs);
dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
- if (IS_ENABLED(CONFIG_DMA_API_DEBUG) &&
- WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr))))
- return DMA_MAPPING_ERROR;
-
return dma_map_phys(dev, phys_addr, size, dir, attrs | DMA_ATTR_MMIO);
}
EXPORT_SYMBOL(dma_map_resource);
@@ -476,7 +470,7 @@ bool dma_need_unmap(struct device *dev)
{
if (!dma_map_direct(dev, get_dma_ops(dev)))
return true;
- if (!dev->dma_skip_sync)
+ if (!dev_dma_skip_sync(dev))
return true;
return IS_ENABLED(CONFIG_DMA_API_DEBUG);
}
@@ -492,16 +486,16 @@ static void dma_setup_need_sync(struct device *dev)
* mapping, if any. During the device initialization, it's
* enough to check only for the DMA coherence.
*/
- dev->dma_skip_sync = dev_is_dma_coherent(dev);
+ dev_assign_dma_skip_sync(dev, dev_is_dma_coherent(dev));
else if (!ops->sync_single_for_device && !ops->sync_single_for_cpu &&
!ops->sync_sg_for_device && !ops->sync_sg_for_cpu)
/*
* Synchronization is not possible when none of DMA sync ops
* is set.
*/
- dev->dma_skip_sync = true;
+ dev_set_dma_skip_sync(dev);
else
- dev->dma_skip_sync = false;
+ dev_clear_dma_skip_sync(dev);
}
#else /* !CONFIG_DMA_NEED_SYNC */
static inline void dma_setup_need_sync(struct device *dev) { }
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 19d2244a9fef..e3d381fd3d25 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -1,11 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/irq-entry-common.h>
-#include <linux/resume_user_mode.h>
+#include <linux/futex.h>
#include <linux/highmem.h>
+#include <linux/irq-entry-common.h>
#include <linux/jump_label.h>
#include <linux/kmsan.h>
#include <linux/livepatch.h>
+#include <linux/resume_user_mode.h>
#include <linux/tick.h>
/* Workaround to allow gradual conversion of architecture code */
@@ -60,8 +61,10 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re
if (ti_work & _TIF_PATCH_PENDING)
klp_update_patch_state(current);
- if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
+ if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) {
+ futex_fixup_robust_unlock(regs);
arch_do_signal_or_restart(regs);
+ }
if (ti_work & _TIF_NOTIFY_RESUME)
resume_user_mode_work(regs);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6d1f8bad7e1c..b1e1c5f0c7ba 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -58,6 +58,7 @@
#include <linux/percpu-rwsem.h>
#include <linux/unwind_deferred.h>
#include <linux/kvm_types.h>
+#include <linux/seq_file.h>
#include "internal.h"
@@ -7006,6 +7007,7 @@ static void perf_mmap_open(struct vm_area_struct *vma)
}
static void perf_pmu_output_stop(struct perf_event *event);
+static void perf_mmap_unaccount(struct vm_area_struct *vma, struct perf_buffer *rb);
/*
* A buffer can be mmap()ed multiple times; either directly through the same
@@ -7021,8 +7023,6 @@ static void perf_mmap_close(struct vm_area_struct *vma)
mapped_f unmapped = get_mapped(event, event_unmapped);
struct perf_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
- int mmap_locked = rb->mmap_locked;
- unsigned long size = perf_data_size(rb);
bool detach_rest = false;
/* FIXIES vs perf_pmu_unregister() */
@@ -7117,11 +7117,7 @@ again:
* Aside from that, this buffer is 'fully' detached and unmapped,
* undo the VM accounting.
*/
-
- atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
- &mmap_user->locked_vm);
- atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
- free_uid(mmap_user);
+ perf_mmap_unaccount(vma, rb);
out_put:
ring_buffer_put(rb); /* could be last */
@@ -7261,6 +7257,15 @@ static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long
atomic64_add(extra, &vma->vm_mm->pinned_vm);
}
+static void perf_mmap_unaccount(struct vm_area_struct *vma, struct perf_buffer *rb)
+{
+ struct user_struct *user = rb->mmap_user;
+
+ atomic_long_sub((perf_data_size(rb) >> PAGE_SHIFT) + 1 - rb->mmap_locked,
+ &user->locked_vm);
+ atomic64_sub(rb->mmap_locked, &vma->vm_mm->pinned_vm);
+}
+
static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
unsigned long nr_pages)
{
@@ -7323,8 +7328,6 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
if (!rb)
return -ENOMEM;
- refcount_set(&rb->mmap_count, 1);
- rb->mmap_user = get_current_user();
rb->mmap_locked = extra;
ring_buffer_attach(event, rb);
@@ -7474,16 +7477,54 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
mapped(event, vma->vm_mm);
/*
- * Try to map it into the page table. On fail, invoke
- * perf_mmap_close() to undo the above, as the callsite expects
- * full cleanup in this case and therefore does not invoke
- * vmops::close().
+ * Try to map it into the page table. On fail undo the above,
+ * as the callsite expects full cleanup in this case and
+ * therefore does not invoke vmops::close().
*/
ret = map_range(event->rb, vma);
- if (ret)
- perf_mmap_close(vma);
+ if (likely(!ret))
+ return 0;
+
+ /* Error path */
+
+ /*
+ * If this is the first mmap(), then event->mmap_count should
+ * be stable at 1. It is only modified by:
+ * perf_mmap_{open,close}() and perf_mmap().
+ *
+ * The former are not possible because this mmap() hasn't been
+ * successful yet, and the latter is serialized by
+ * event->mmap_mutex which we still hold (note that mmap_lock
+ * is not strictly sufficient here, because the event fd can
+ * be passed to another process through trivial means like
+ * fork(), leading to concurrent mmap() from different mm).
+ *
+ * Make sure to remove event->rb before releasing
+ * event->mmap_mutex, such that any concurrent mmap() will not
+ * attempt to use this failed buffer.
+ */
+ if (refcount_read(&event->mmap_count) == 1) {
+ /*
+ * Minimal perf_mmap_close(); there can't be AUX or
+ * other events on account of this being the first.
+ */
+ mapped = get_mapped(event, event_unmapped);
+ if (mapped)
+ mapped(event, vma->vm_mm);
+ perf_mmap_unaccount(vma, event->rb);
+ ring_buffer_attach(event, NULL); /* drops last rb->refcount */
+ refcount_set(&event->mmap_count, 0);
+ return ret;
+ }
+
+ /*
+ * Otherwise this is an already existing buffer, and there is
+ * no race vs first exposure, so fall-through and call
+ * perf_mmap_close().
+ */
}
+ perf_mmap_close(vma);
return ret;
}
@@ -7506,6 +7547,33 @@ static int perf_fasync(int fd, struct file *filp, int on)
return 0;
}
+static void perf_show_fdinfo(struct seq_file *m, struct file *f)
+{
+ struct perf_event *event = f->private_data;
+ struct perf_event_context *ctx;
+ struct mutex *child_mutex;
+
+ ctx = perf_event_ctx_lock(event);
+ child_mutex = event->parent ? &event->parent->child_mutex : &event->child_mutex;
+ mutex_lock(child_mutex);
+
+ seq_printf(m, "perf_event_attr.type:\t%u\n", event->orig_type);
+ if (event->pmu)
+ seq_printf(m, "pmu_type:\t%u\n", event->pmu->type);
+ seq_printf(m, "perf_event_attr.config:\t0x%llx\n", (unsigned long long)event->attr.config);
+ seq_printf(m, "perf_event_attr.config1:\t0x%llx\n",
+ (unsigned long long)event->attr.config1);
+ seq_printf(m, "perf_event_attr.config2:\t0x%llx\n",
+ (unsigned long long)event->attr.config2);
+ seq_printf(m, "perf_event_attr.config3:\t0x%llx\n",
+ (unsigned long long)event->attr.config3);
+ seq_printf(m, "perf_event_attr.config4:\t0x%llx\n",
+ (unsigned long long)event->attr.config4);
+
+ mutex_unlock(child_mutex);
+ perf_event_ctx_unlock(event, ctx);
+}
+
static const struct file_operations perf_fops = {
.release = perf_release,
.read = perf_read,
@@ -7514,6 +7582,7 @@ static const struct file_operations perf_fops = {
.compat_ioctl = perf_compat_ioctl,
.mmap = perf_mmap,
.fasync = perf_fasync,
+ .show_fdinfo = perf_show_fdinfo,
};
/*
@@ -11643,6 +11712,15 @@ static int __perf_event_set_bpf_prog(struct perf_event *event,
/* only uprobe programs are allowed to be sleepable */
return -EINVAL;
+ if (prog->type == BPF_PROG_TYPE_TRACEPOINT && prog->sleepable) {
+ /*
+ * Sleepable tracepoint programs can only attach to faultable
+ * tracepoints. Currently only syscall tracepoints are faultable.
+ */
+ if (!is_syscall_tp)
+ return -EINVAL;
+ }
+
/* Kprobe override only works for kprobes, not uprobes. */
if (prog->kprobe_override && !is_kprobe)
return -EINVAL;
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index d9cc57083091..c03c4f2eea57 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -67,6 +67,7 @@ static inline void rb_free_rcu(struct rcu_head *rcu_head)
struct perf_buffer *rb;
rb = container_of(rcu_head, struct perf_buffer, rcu_head);
+ free_uid(rb->mmap_user);
rb_free(rb);
}
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 3e7de2661417..9fe92161715e 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -340,6 +340,8 @@ ring_buffer_init(struct perf_buffer *rb, long watermark, int flags)
rb->paused = 1;
mutex_init(&rb->aux_mutex);
+ rb->mmap_user = get_current_user();
+ refcount_set(&rb->mmap_count, 1);
}
void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
diff --git a/kernel/exec_state.c b/kernel/exec_state.c
new file mode 100644
index 000000000000..6034f4b4808f
--- /dev/null
+++ b/kernel/exec_state.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */
+#include <linux/init.h>
+#include <linux/rcupdate.h>
+#include <linux/refcount.h>
+#include <linux/sched.h>
+#include <linux/sched/coredump.h>
+#include <linux/sched/exec_state.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/user_namespace.h>
+
+static struct kmem_cache *task_exec_state_cachep;
+
+static void __free_task_exec_state(struct rcu_head *rcu)
+{
+ struct task_exec_state *exec_state = container_of(rcu, struct task_exec_state, rcu);
+
+ put_user_ns(exec_state->user_ns);
+ kmem_cache_free(task_exec_state_cachep, exec_state);
+}
+
+void put_task_exec_state(struct task_exec_state *exec_state)
+{
+ if (exec_state && refcount_dec_and_test(&exec_state->count))
+ call_rcu(&exec_state->rcu, __free_task_exec_state);
+}
+
+struct task_exec_state *alloc_task_exec_state(struct user_namespace *user_ns)
+{
+ struct task_exec_state *exec_state;
+
+ exec_state = kmem_cache_alloc(task_exec_state_cachep, GFP_KERNEL);
+ if (!exec_state)
+ return NULL;
+ refcount_set(&exec_state->count, 1);
+ exec_state->dumpable = TASK_DUMPABLE_OFF;
+ exec_state->user_ns = get_user_ns(user_ns);
+ return exec_state;
+}
+
+struct task_exec_state *task_exec_state_rcu(const struct task_struct *tsk)
+{
+ struct task_exec_state *exec_state;
+
+ exec_state = rcu_dereference_check(tsk->exec_state,
+ lockdep_is_held(&tsk->alloc_lock));
+ WARN_ON_ONCE(!exec_state);
+ return exec_state;
+}
+
+struct task_exec_state *task_exec_state_replace(struct task_struct *tsk,
+ struct task_exec_state *exec_state)
+{
+ /*
+ * Updates must hold both locks so callers needing a consistent
+ * snapshot of mm + dumpability are covered.
+ *
+ * Both requirements are checked by lockdep right below:
+ * tsk->alloc_lock must be held and tsk->signal->exec_update_lock
+ * must be held for write.
+ *
+ * The new pointer is installed with rcu_replace_pointer(), which
+ * returns the pointer that was installed before; that old value is
+ * what this function returns. NOTE(review): presumably the caller
+ * is responsible for dropping the old reference with
+ * put_task_exec_state() - confirm against the call sites.
+ */
+ lockdep_assert_held(&tsk->alloc_lock);
+ lockdep_assert_held_write(&tsk->signal->exec_update_lock);
+
+ return rcu_replace_pointer(tsk->exec_state, exec_state, true);
+}
+
+/*
+ * The non-CLONE_VM clone path: allocate a fresh exec_state and
+ * inherit the parent's dumpable mode and user_ns reference. CLONE_VM
+ * siblings refcount-share via copy_exec_state() in fork.c; only this
+ * path and execve() ever allocate.
+ *
+ * @tsk: task that receives the newly allocated exec_state
+ *
+ * The source is current's exec_state. The copy starts with a reference
+ * count of one and holds its own reference on the source's user
+ * namespace (both set up by alloc_task_exec_state()); the dumpable
+ * value is then copied over before the pointer is published in @tsk.
+ *
+ * Returns 0 on success, or -ENOMEM if the allocation fails, in which
+ * case @tsk->exec_state is not written.
+ */
+int task_exec_state_copy(struct task_struct *tsk)
+{
+ struct task_exec_state *src, *dst;
+
+ src = rcu_dereference_protected(current->exec_state, true);
+ dst = alloc_task_exec_state(src->user_ns);
+ if (!dst)
+ return -ENOMEM;
+ dst->dumpable = READ_ONCE(src->dumpable);
+ rcu_assign_pointer(tsk->exec_state, dst);
+ return 0;
+}
+
+/*
+ * Store TASK_DUMPABLE_* on current->exec_state. All callers
+ * (commit_creds, begin_new_exec, prctl(PR_SET_DUMPABLE)) act on the
+ * running task, which guarantees ->exec_state is allocated and cannot
+ * be replaced under us.
+ *
+ * @value: new dumpable mode; anything above TASK_DUMPABLE_ROOT warns
+ * once and is replaced by TASK_DUMPABLE_OFF before the store
+ *
+ * If current is using init_task_exec_state the function warns once and
+ * returns without writing anything.
+ */
+void task_exec_state_set_dumpable(enum task_dumpable value)
+{
+ struct task_exec_state *exec_state;
+
+ if (WARN_ON_ONCE(value > TASK_DUMPABLE_ROOT))
+ value = TASK_DUMPABLE_OFF;
+
+ exec_state = rcu_dereference_protected(current->exec_state, true);
+ /* mm-less tasks share init_task's exec_state; never mutate it */
+ if (WARN_ON_ONCE(exec_state == &init_task_exec_state))
+ return;
+ WRITE_ONCE(exec_state->dumpable, value);
+}
+
+enum task_dumpable task_exec_state_get_dumpable(struct task_struct *task)
+{
+ struct task_exec_state *exec_state;
+
+ guard(rcu)();
+ exec_state = rcu_dereference(task->exec_state);
+ return READ_ONCE(exec_state->dumpable);
+}
+
+void __init exec_state_init(void)
+{
+ task_exec_state_cachep = kmem_cache_create("task_exec_state",
+ sizeof(struct task_exec_state), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
+ NULL);
+}
diff --git a/kernel/exit.c b/kernel/exit.c
index 25e9cb6de7e7..1056422bc101 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -543,6 +543,32 @@ void mm_update_next_owner(struct mm_struct *mm)
}
#endif /* CONFIG_MEMCG */
+#if defined(CONFIG_SCHED_CACHE) && defined(CONFIG_NUMA_BALANCING)
+/*
+ * Subtract the memory footprint of the current task from
+ * mm.
+ *
+ * @mm: address space whose sc_stat.footprint is reduced
+ *
+ * current->total_numa_faults is the amount subtracted; nothing is done
+ * when it is zero. Unless both CONFIG_SCHED_CACHE and
+ * CONFIG_NUMA_BALANCING are enabled, the empty stub below is used
+ * instead.
+ */
+static void exit_mm_sched_cache(struct mm_struct *mm)
+{
+ unsigned long fp, sub;
+
+ if (!current->total_numa_faults)
+ return;
+ /*
+ * No lock protection due to performance considerations.
+ * Make sure mm->sc_stat.footprint does not become
+ * negative.
+ *
+ * The amount subtracted is clamped to the value just read, so
+ * the stored result cannot wrap below zero. The read and the
+ * write are separate accesses, i.e. the update is not atomic.
+ */
+ fp = READ_ONCE(mm->sc_stat.footprint);
+ sub = min(fp, current->total_numa_faults);
+ WRITE_ONCE(mm->sc_stat.footprint, fp - sub);
+}
+#else
+static inline void exit_mm_sched_cache(struct mm_struct *mm)
+{
+}
+#endif /* CONFIG_SCHED_CACHE && CONFIG_NUMA_BALANCING */
+
/*
* Turn us into a lazy TLB process if we
* aren't already..
@@ -554,6 +580,9 @@ static void exit_mm(void)
exit_mm_release(current, mm);
if (!mm)
return;
+
+ exit_mm_sched_cache(mm);
+
mmap_read_lock(mm);
mmgrab_lazy_tlb(mm);
BUG_ON(mm != current->active_mm);
@@ -988,8 +1017,8 @@ void __noreturn do_exit(long code)
proc_exit_connector(tsk);
mpol_put_task_policy(tsk);
#ifdef CONFIG_FUTEX
- if (unlikely(current->pi_state_cache))
- kfree(current->pi_state_cache);
+ if (unlikely(current->futex.pi_state_cache))
+ kfree(current->futex.pi_state_cache);
#endif
/*
* Make sure we are holding no locks:
@@ -1073,6 +1102,7 @@ void __noreturn make_task_dead(int signr)
futex_exit_recursive(tsk);
tsk->exit_state = EXIT_DEAD;
refcount_inc(&tsk->rcu_users);
+ preempt_disable();
do_task_dead();
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 5f3fdfdb14c7..addc555a1077 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -23,6 +23,7 @@
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/sched/ext.h>
+#include <linux/sched/exec_state.h>
#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
@@ -555,6 +556,7 @@ void free_task(struct task_struct *tsk)
if (tsk->flags & PF_KTHREAD)
free_kthread_struct(tsk);
bpf_task_storage_free(tsk);
+ put_task_exec_state(rcu_access_pointer(tsk->exec_state));
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
@@ -726,12 +728,12 @@ void __mmdrop(struct mm_struct *mm)
cleanup_lazy_tlbs(mm);
WARN_ON_ONCE(mm == current->active_mm);
+ mm_destroy_sched(mm);
mm_free_pgd(mm);
mm_free_id(mm);
destroy_context(mm);
mmu_notifier_subscriptions_destroy(mm);
check_mm(mm);
- put_user_ns(mm->user_ns);
mm_pasid_drop(mm);
mm_destroy_cid(mm);
percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
@@ -946,6 +948,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->seccomp.filter = NULL;
#endif
+ RCU_INIT_POINTER(tsk->exec_state, NULL);
+
setup_thread_stack(tsk, orig);
clear_user_return_notifier(tsk);
clear_tsk_need_resched(tsk);
@@ -1072,8 +1076,7 @@ static void mmap_init_lock(struct mm_struct *mm)
#endif
}
-static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
- struct user_namespace *user_ns)
+static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
{
mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
@@ -1101,6 +1104,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
#endif
mm_init_uprobes_state(mm);
hugetlb_count_init(mm);
+ futex_mm_init(mm);
mm_flags_clear_all(mm);
if (current->mm) {
@@ -1113,11 +1117,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm->def_flags = 0;
}
- if (futex_mm_init(mm))
- goto fail_mm_init;
-
if (mm_alloc_pgd(mm))
- goto fail_nopgd;
+ goto fail_mm_init;
if (mm_alloc_id(mm))
goto fail_noid;
@@ -1128,15 +1129,19 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (mm_alloc_cid(mm, p))
goto fail_cid;
+ if (mm_alloc_sched(mm))
+ goto fail_sched;
+
if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
NR_MM_COUNTERS))
goto fail_pcpu;
- mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
return mm;
fail_pcpu:
+ mm_destroy_sched(mm);
+fail_sched:
mm_destroy_cid(mm);
fail_cid:
destroy_context(mm);
@@ -1144,8 +1149,6 @@ fail_nocontext:
mm_free_id(mm);
fail_noid:
mm_free_pgd(mm);
-fail_nopgd:
- futex_hash_free(mm);
fail_mm_init:
free_mm(mm);
return NULL;
@@ -1163,7 +1166,7 @@ struct mm_struct *mm_alloc(void)
return NULL;
memset(mm, 0, sizeof(*mm));
- return mm_init(mm, current, current_user_ns());
+ return mm_init(mm, current);
}
EXPORT_SYMBOL_IF_KUNIT(mm_alloc);
@@ -1527,7 +1530,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk,
memcpy(mm, oldmm, sizeof(*mm));
- if (!mm_init(mm, tsk, mm->user_ns))
+ if (!mm_init(mm, tsk))
goto fail_nomem;
uprobe_start_dup_mmap();
@@ -1593,6 +1596,22 @@ static int copy_mm(u64 clone_flags, struct task_struct *tsk)
return 0;
}
+static int copy_exec_state(u64 clone_flags, struct task_struct *tsk)
+{
+ struct task_exec_state *exec_state;
+
+ /*
+ * CLONE_VM siblings refcount-share the parent's exec_state: take
+ * one more reference on current's object and install the same
+ * pointer in @tsk. This branch always returns 0.
+ */
+ if (clone_flags & CLONE_VM) {
+ exec_state = rcu_dereference_protected(current->exec_state, true);
+ refcount_inc(&exec_state->count);
+ rcu_assign_pointer(tsk->exec_state, exec_state);
+ return 0;
+ }
+
+ /*
+ * Everyone else inherits a fresh copy; the result of
+ * task_exec_state_copy() is returned to the caller unchanged.
+ */
+ return task_exec_state_copy(tsk);
+}
+
static int copy_fs(u64 clone_flags, struct task_struct *tsk)
{
struct fs_struct *fs = current->fs;
@@ -2090,6 +2109,9 @@ __latent_entropy struct task_struct *copy_process(
p = dup_task_struct(current, node);
if (!p)
goto fork_out;
+ retval = copy_exec_state(clone_flags, p);
+ if (retval)
+ goto bad_fork_free;
p->flags &= ~PF_KTHREAD;
if (args->kthread)
p->flags |= PF_KTHREAD;
@@ -2218,6 +2240,7 @@ __latent_entropy struct task_struct *copy_process(
lockdep_init_task(p);
p->blocked_on = NULL; /* not blocked yet */
+ p->blocked_donor = NULL; /* nobody is boosting p yet */
#ifdef CONFIG_BCACHE
p->sequential_io = 0;
@@ -2664,8 +2687,6 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
*
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
- *
- * args->exit_signal is expected to be checked for sanity by the caller.
*/
pid_t kernel_clone(struct kernel_clone_args *args)
{
@@ -2700,6 +2721,9 @@ pid_t kernel_clone(struct kernel_clone_args *args)
(args->pidfd == args->parent_tid))
return -EINVAL;
+ if (!valid_signal(args->exit_signal))
+ return -EINVAL;
+
/*
* Determine whether and which event to report to ptracer. When
* called from kernel_thread or CLONE_UNTRACED is explicitly
@@ -2898,11 +2922,9 @@ static noinline int copy_clone_args_from_user(struct kernel_clone_args *kargs,
return -EINVAL;
/*
- * Verify that higher 32bits of exit_signal are unset and that
- * it is a valid signal
+ * Verify that higher 32bits of exit_signal are unset
*/
- if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
- !valid_signal(args.exit_signal)))
+ if (unlikely(args.exit_signal & ~((u64)CSIGNAL)))
return -EINVAL;
if ((args.flags & CLONE_INTO_CGROUP) &&
@@ -3098,6 +3120,7 @@ void __init proc_caches_init(void)
sizeof(struct signal_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
+ exec_state_init();
files_cachep = kmem_cache_create("files_cache",
sizeof(struct files_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index ff2a4fb2993f..179b26e9c934 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -32,18 +32,21 @@
* "But they come in a choice of three flavours!"
*/
#include <linux/compat.h>
-#include <linux/jhash.h>
-#include <linux/pagemap.h>
#include <linux/debugfs.h>
-#include <linux/plist.h>
+#include <linux/fault-inject.h>
#include <linux/gfp.h>
-#include <linux/vmalloc.h>
+#include <linux/jhash.h>
#include <linux/memblock.h>
-#include <linux/fault-inject.h>
-#include <linux/slab.h>
-#include <linux/prctl.h>
#include <linux/mempolicy.h>
#include <linux/mmap_lock.h>
+#include <linux/pagemap.h>
+#include <linux/plist.h>
+#include <linux/prctl.h>
+#include <linux/rseq.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <vdso/futex.h>
#include "futex.h"
#include "../locking/rtmutex_common.h"
@@ -124,7 +127,7 @@ late_initcall(fail_futex_debugfs);
#endif /* CONFIG_FAIL_FUTEX */
static struct futex_hash_bucket *
-__futex_hash(union futex_key *key, struct futex_private_hash *fph);
+__futex_hash(union futex_key *key, struct futex_private_hash *fph, struct futex_private_hash **fph_p);
#ifdef CONFIG_FUTEX_PRIVATE_HASH
static bool futex_ref_get(struct futex_private_hash *fph);
@@ -133,15 +136,6 @@ static bool futex_ref_is_dead(struct futex_private_hash *fph);
enum { FR_PERCPU = 0, FR_ATOMIC };
-static inline bool futex_key_is_private(union futex_key *key)
-{
- /*
- * Relies on get_futex_key() to set either bit for shared
- * futexes -- see comment with union futex_key.
- */
- return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED));
-}
-
static bool futex_private_hash_get(struct futex_private_hash *fph)
{
return futex_ref_get(fph);
@@ -149,51 +143,18 @@ static bool futex_private_hash_get(struct futex_private_hash *fph)
void futex_private_hash_put(struct futex_private_hash *fph)
{
- if (futex_ref_put(fph))
+ if (fph && futex_ref_put(fph))
wake_up_var(fph->mm);
}
-/**
- * futex_hash_get - Get an additional reference for the local hash.
- * @hb: ptr to the private local hash.
- *
- * Obtain an additional reference for the already obtained hash bucket. The
- * caller must already own an reference.
- */
-void futex_hash_get(struct futex_hash_bucket *hb)
-{
- struct futex_private_hash *fph = hb->priv;
-
- if (!fph)
- return;
- WARN_ON_ONCE(!futex_private_hash_get(fph));
-}
-
-void futex_hash_put(struct futex_hash_bucket *hb)
-{
- struct futex_private_hash *fph = hb->priv;
-
- if (!fph)
- return;
- futex_private_hash_put(fph);
-}
-
static struct futex_hash_bucket *
__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
{
u32 hash;
- if (!futex_key_is_private(key))
- return NULL;
-
- if (!fph)
- fph = rcu_dereference(key->private.mm->futex_phash);
- if (!fph || !fph->hash_mask)
- return NULL;
-
- hash = jhash2((void *)&key->private.address,
- sizeof(key->private.address) / 4,
+ hash = jhash2((void *)&key->private.address, sizeof(key->private.address) / 4,
key->both.offset);
+
return &fph->queues[hash & fph->hash_mask];
}
@@ -211,13 +172,12 @@ static void futex_rehash_private(struct futex_private_hash *old,
spin_lock(&hb_old->lock);
plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) {
-
plist_del(&this->list, &hb_old->chain);
futex_hb_waiters_dec(hb_old);
WARN_ON_ONCE(this->lock_ptr != &hb_old->lock);
- hb_new = __futex_hash(&this->key, new);
+ hb_new = __futex_hash(&this->key, new, NULL);
futex_hb_waiters_inc(hb_new);
/*
* The new pointer isn't published yet but an already
@@ -232,18 +192,17 @@ static void futex_rehash_private(struct futex_private_hash *old,
}
}
-static bool __futex_pivot_hash(struct mm_struct *mm,
- struct futex_private_hash *new)
+static bool __futex_pivot_hash(struct mm_struct *mm, struct futex_private_hash *new)
{
+ struct futex_mm_phash *mmph = &mm->futex.phash;
struct futex_private_hash *fph;
- WARN_ON_ONCE(mm->futex_phash_new);
+ WARN_ON_ONCE(mmph->hash_new);
- fph = rcu_dereference_protected(mm->futex_phash,
- lockdep_is_held(&mm->futex_hash_lock));
+ fph = rcu_dereference_protected(mmph->hash, lockdep_is_held(&mmph->lock));
if (fph) {
if (!futex_ref_is_dead(fph)) {
- mm->futex_phash_new = new;
+ mmph->hash_new = new;
return false;
}
@@ -251,8 +210,8 @@ static bool __futex_pivot_hash(struct mm_struct *mm,
}
new->state = FR_PERCPU;
scoped_guard(rcu) {
- mm->futex_batches = get_state_synchronize_rcu();
- rcu_assign_pointer(mm->futex_phash, new);
+ mmph->batches = get_state_synchronize_rcu();
+ rcu_assign_pointer(mmph->hash, new);
}
kvfree_rcu(fph, rcu);
return true;
@@ -260,20 +219,19 @@ static bool __futex_pivot_hash(struct mm_struct *mm,
static void futex_pivot_hash(struct mm_struct *mm)
{
- scoped_guard(mutex, &mm->futex_hash_lock) {
+ scoped_guard(mutex, &mm->futex.phash.lock) {
struct futex_private_hash *fph;
- fph = mm->futex_phash_new;
+ fph = mm->futex.phash.hash_new;
if (fph) {
- mm->futex_phash_new = NULL;
+ mm->futex.phash.hash_new = NULL;
__futex_pivot_hash(mm, fph);
}
}
}
-struct futex_private_hash *futex_private_hash(void)
+struct futex_private_hash *futex_private_hash(struct mm_struct *mm)
{
- struct mm_struct *mm = current->mm;
/*
* Ideally we don't loop. If there is a replacement in progress
* then a new private hash is already prepared and a reference can't be
@@ -288,7 +246,7 @@ again:
scoped_guard(rcu) {
struct futex_private_hash *fph;
- fph = rcu_dereference(mm->futex_phash);
+ fph = rcu_dereference(mm->futex.phash.hash);
if (!fph)
return NULL;
@@ -299,18 +257,17 @@ again:
goto again;
}
-struct futex_hash_bucket *futex_hash(union futex_key *key)
+struct futex_bucket_ref futex_hash(union futex_key *key)
{
- struct futex_private_hash *fph;
- struct futex_hash_bucket *hb;
-
again:
scoped_guard(rcu) {
- hb = __futex_hash(key, NULL);
- fph = hb->priv;
+ struct futex_private_hash *fph = NULL;
+ struct futex_hash_bucket *hb;
+
+ hb = __futex_hash(key, NULL, &fph);
if (!fph || futex_private_hash_get(fph))
- return hb;
+ return (struct futex_bucket_ref){ .hb = hb, .fph = fph };
}
futex_pivot_hash(key->private.mm);
goto again;
@@ -318,15 +275,9 @@ again:
#else /* !CONFIG_FUTEX_PRIVATE_HASH */
-static struct futex_hash_bucket *
-__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
+struct futex_bucket_ref futex_hash(union futex_key *key)
{
- return NULL;
-}
-
-struct futex_hash_bucket *futex_hash(union futex_key *key)
-{
- return __futex_hash(key, NULL);
+ return (struct futex_bucket_ref){ .hb = __futex_hash(key, NULL, NULL), .fph = NULL };
}
#endif /* CONFIG_FUTEX_PRIVATE_HASH */
@@ -404,6 +355,8 @@ static int futex_mpol(struct mm_struct *mm, unsigned long addr)
* __futex_hash - Return the hash bucket
* @key: Pointer to the futex key for which the hash is calculated
* @fph: Pointer to private hash if known
+ * @fph_p: Optional output; if non-NULL it receives the private hash that was
+ * used. Not written when the global hash is returned.
*
* We hash on the keys returned from get_futex_key (see below) and return the
* corresponding hash bucket.
@@ -412,21 +365,24 @@ static int futex_mpol(struct mm_struct *mm, unsigned long addr)
* global hash is returned.
*/
static struct futex_hash_bucket *
-__futex_hash(union futex_key *key, struct futex_private_hash *fph)
+__futex_hash(union futex_key *key, struct futex_private_hash *fph, struct futex_private_hash **fph_p)
{
int node = key->both.node;
u32 hash;
- if (node == FUTEX_NO_NODE) {
- struct futex_hash_bucket *hb;
-
- hb = __futex_hash_private(key, fph);
- if (hb)
- return hb;
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+ if (node == FUTEX_NO_NODE && futex_key_is_private(key)) {
+ if (!fph)
+ fph = rcu_dereference(key->private.mm->futex.phash.hash);
+ if (fph && fph->hash_mask) {
+ if (fph_p)
+ *fph_p = fph;
+ return __futex_hash_private(key, fph);
+ }
}
+#endif
- hash = jhash2((u32 *)key,
- offsetof(typeof(*key), both.offset) / sizeof(u32),
+ hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / sizeof(u32),
key->both.offset);
if (node == FUTEX_NO_NODE) {
@@ -441,8 +397,7 @@ __futex_hash(union futex_key *key, struct futex_private_hash *fph)
*/
node = (hash >> futex_hashshift) % nr_node_ids;
if (!node_possible(node)) {
- node = find_next_bit_wrap(node_possible_map.bits,
- nr_node_ids, node);
+ node = find_next_bit_wrap(node_possible_map.bits, nr_node_ids, node);
}
}
@@ -459,9 +414,8 @@ __futex_hash(union futex_key *key, struct futex_private_hash *fph)
* Return: Initialized hrtimer_sleeper structure or NULL if no timeout
* value given
*/
-struct hrtimer_sleeper *
-futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
- int flags, u64 range_ns)
+struct hrtimer_sleeper *futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
+ int flags, u64 range_ns)
{
if (!time)
return NULL;
@@ -829,7 +783,7 @@ void wait_for_owner_exiting(int ret, struct task_struct *exiting)
if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
return;
- mutex_lock(&exiting->futex_exit_mutex);
+ mutex_lock(&exiting->futex.exit_mutex);
/*
* No point in doing state checking here. If the waiter got here
* while the task was in exec()->exec_futex_release() then it can
@@ -838,7 +792,7 @@ void wait_for_owner_exiting(int ret, struct task_struct *exiting)
* already. Highly unlikely and not a problem. Just one more round
* through the futex maze.
*/
- mutex_unlock(&exiting->futex_exit_mutex);
+ mutex_unlock(&exiting->futex.exit_mutex);
put_task_struct(exiting);
}
@@ -1012,8 +966,9 @@ void futex_unqueue_pi(struct futex_q *q)
* dying task, and do notification if so:
*/
static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
- bool pi, bool pending_op)
+ unsigned int mod, bool pending_op)
{
+ bool pi = !!(mod & FUTEX_ROBUST_MOD_PI);
u32 uval, nval, mval;
pid_t owner;
int err;
@@ -1047,7 +1002,7 @@ retry:
*
* In both cases the following conditions are met:
*
- * 1) task->robust_list->list_op_pending != NULL
+ * 1) task->futex.robust_list->list_op_pending != NULL
* @pending_op == true
* 2) The owner part of user space futex value == 0
* 3) Regular futex: @pi == false
@@ -1065,7 +1020,7 @@ retry:
owner = uval & FUTEX_TID_MASK;
if (pending_op && !pi && !owner) {
- futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
+ futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, NULL, 1,
FUTEX_BITSET_MATCH_ANY);
return 0;
}
@@ -1119,7 +1074,7 @@ retry:
* PI futexes happens in exit_pi_state():
*/
if (!pi && (uval & FUTEX_WAITERS)) {
- futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
+ futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, NULL, 1,
FUTEX_BITSET_MATCH_ANY);
}
@@ -1131,31 +1086,30 @@ retry:
*/
static inline int fetch_robust_entry(struct robust_list __user **entry,
struct robust_list __user * __user *head,
- unsigned int *pi)
+ unsigned int *mod)
{
unsigned long uentry;
if (get_user(uentry, (unsigned long __user *)head))
return -EFAULT;
- *entry = (void __user *)(uentry & ~1UL);
- *pi = uentry & 1;
+ *entry = (void __user *)(uentry & ~FUTEX_ROBUST_MOD_MASK);
+ *mod = uentry & FUTEX_ROBUST_MOD_MASK;
return 0;
}
/*
- * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * Walk curr->futex.robust_list (very carefully, it's a userspace list!)
* and mark any locks found there dead, and notify any waiters.
*
* We silently return on any sign of list-walking problem.
*/
static void exit_robust_list(struct task_struct *curr)
{
- struct robust_list_head __user *head = curr->robust_list;
+ struct robust_list_head __user *head = curr->futex.robust_list;
+ unsigned int limit = ROBUST_LIST_LIMIT, cur_mod, next_mod, pend_mod;
struct robust_list __user *entry, *next_entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
- unsigned int next_pi;
unsigned long futex_offset;
int rc;
@@ -1163,7 +1117,7 @@ static void exit_robust_list(struct task_struct *curr)
* Fetch the list head (which was registered earlier, via
* sys_set_robust_list()):
*/
- if (fetch_robust_entry(&entry, &head->list.next, &pi))
+ if (fetch_robust_entry(&entry, &head->list.next, &cur_mod))
return;
/*
* Fetch the relative futex offset:
@@ -1174,7 +1128,7 @@ static void exit_robust_list(struct task_struct *curr)
* Fetch any possibly pending lock-add first, and handle it
* if it exists:
*/
- if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
+ if (fetch_robust_entry(&pending, &head->list_op_pending, &pend_mod))
return;
next_entry = NULL; /* avoid warning with gcc */
@@ -1183,20 +1137,20 @@ static void exit_robust_list(struct task_struct *curr)
* Fetch the next entry in the list before calling
* handle_futex_death:
*/
- rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
+ rc = fetch_robust_entry(&next_entry, &entry->next, &next_mod);
/*
* A pending lock might already be on the list, so
* don't process it twice:
*/
if (entry != pending) {
if (handle_futex_death((void __user *)entry + futex_offset,
- curr, pi, HANDLE_DEATH_LIST))
+ curr, cur_mod, HANDLE_DEATH_LIST))
return;
}
if (rc)
return;
entry = next_entry;
- pi = next_pi;
+ cur_mod = next_mod;
/*
* Avoid excessively long or circular lists:
*/
@@ -1208,10 +1162,31 @@ static void exit_robust_list(struct task_struct *curr)
if (pending) {
handle_futex_death((void __user *)pending + futex_offset,
- curr, pip, HANDLE_DEATH_PENDING);
+ curr, pend_mod, HANDLE_DEATH_PENDING);
}
}
+static bool robust_list_clear_pending(unsigned long __user *pop)
+{
+ struct robust_list_head __user *head = current->futex.robust_list;
+
+ if (!put_user(0UL, pop))
+ return true;
+
+ /*
+ * Just give up. The robust list head is usually part of TLS, so the
+ * chance that this gets resolved is close to zero.
+ *
+ * If @pop is the robust_list_head::list_op_pending pointer then
+ * clear the robust list head pointer to prevent further damage when the
+ * task exits. Better a few stale futexes than corrupted memory. But
+ * that's mostly an academic exercise.
+ */
+ if (pop == (unsigned long __user *)&head->list_op_pending)
+ current->futex.robust_list = NULL;
+ return false;
+}
+
#ifdef CONFIG_COMPAT
static void __user *futex_uaddr(struct robust_list __user *entry,
compat_long_t futex_offset)
@@ -1227,29 +1202,28 @@ static void __user *futex_uaddr(struct robust_list __user *entry,
*/
static inline int
compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
- compat_uptr_t __user *head, unsigned int *pi)
+ compat_uptr_t __user *head, unsigned int *pflags)
{
if (get_user(*uentry, head))
return -EFAULT;
- *entry = compat_ptr((*uentry) & ~1);
- *pi = (unsigned int)(*uentry) & 1;
+ *entry = compat_ptr((*uentry) & ~FUTEX_ROBUST_MOD_MASK);
+ *pflags = (unsigned int)(*uentry) & FUTEX_ROBUST_MOD_MASK;
return 0;
}
/*
- * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * Walk curr->futex.robust_list (very carefully, it's a userspace list!)
* and mark any locks found there dead, and notify any waiters.
*
* We silently return on any sign of list-walking problem.
*/
static void compat_exit_robust_list(struct task_struct *curr)
{
- struct compat_robust_list_head __user *head = curr->compat_robust_list;
+ struct compat_robust_list_head __user *head = current->futex.compat_robust_list;
+ unsigned int limit = ROBUST_LIST_LIMIT, cur_mod, next_mod, pend_mod;
struct robust_list __user *entry, *next_entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
- unsigned int next_pi;
compat_uptr_t uentry, next_uentry, upending;
compat_long_t futex_offset;
int rc;
@@ -1258,7 +1232,7 @@ static void compat_exit_robust_list(struct task_struct *curr)
* Fetch the list head (which was registered earlier, via
* sys_set_robust_list()):
*/
- if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
+ if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &cur_mod))
return;
/*
* Fetch the relative futex offset:
@@ -1269,8 +1243,7 @@ static void compat_exit_robust_list(struct task_struct *curr)
* Fetch any possibly pending lock-add first, and handle it
* if it exists:
*/
- if (compat_fetch_robust_entry(&upending, &pending,
- &head->list_op_pending, &pip))
+ if (compat_fetch_robust_entry(&upending, &pending, &head->list_op_pending, &pend_mod))
return;
next_entry = NULL; /* avoid warning with gcc */
@@ -1280,7 +1253,7 @@ static void compat_exit_robust_list(struct task_struct *curr)
* handle_futex_death:
*/
rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
- (compat_uptr_t __user *)&entry->next, &next_pi);
+ (compat_uptr_t __user *)&entry->next, &next_mod);
/*
* A pending lock might already be on the list, so
* dont process it twice:
@@ -1288,15 +1261,14 @@ static void compat_exit_robust_list(struct task_struct *curr)
if (entry != pending) {
void __user *uaddr = futex_uaddr(entry, futex_offset);
- if (handle_futex_death(uaddr, curr, pi,
- HANDLE_DEATH_LIST))
+ if (handle_futex_death(uaddr, curr, cur_mod, HANDLE_DEATH_LIST))
return;
}
if (rc)
return;
uentry = next_uentry;
entry = next_entry;
- pi = next_pi;
+ cur_mod = next_mod;
/*
* Avoid excessively long or circular lists:
*/
@@ -1308,9 +1280,24 @@ static void compat_exit_robust_list(struct task_struct *curr)
if (pending) {
void __user *uaddr = futex_uaddr(pending, futex_offset);
- handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
+ handle_futex_death(uaddr, curr, pend_mod, HANDLE_DEATH_PENDING);
}
}
+
+static bool compat_robust_list_clear_pending(u32 __user *pop)
+{
+ struct compat_robust_list_head __user *head = current->futex.compat_robust_list;
+
+ if (!put_user(0U, pop))
+ return true;
+
+ /* See comment in robust_list_clear_pending(). */
+ if (pop == &head->list_op_pending)
+ current->futex.compat_robust_list = NULL;
+ return false;
+}
+#else
+static bool compat_robust_list_clear_pending(u32 __user *pop_addr) { return false; }
#endif
#ifdef CONFIG_FUTEX_PI
@@ -1322,7 +1309,7 @@ static void compat_exit_robust_list(struct task_struct *curr)
*/
static void exit_pi_state_list(struct task_struct *curr)
{
- struct list_head *next, *head = &curr->pi_state_list;
+ struct list_head *next, *head = &curr->futex.pi_state_list;
struct futex_pi_state *pi_state;
union futex_key key = FUTEX_KEY_INIT;
@@ -1336,7 +1323,7 @@ static void exit_pi_state_list(struct task_struct *curr)
* on the mutex.
*/
WARN_ON(curr != current);
- guard(private_hash)();
+ guard(private_hash)(current->mm);
/*
* We are a ZOMBIE and nobody can enqueue itself on
* pi_state_list anymore, but we have to be careful
@@ -1348,7 +1335,8 @@ static void exit_pi_state_list(struct task_struct *curr)
pi_state = list_entry(next, struct futex_pi_state, list);
key = pi_state->key;
if (1) {
- CLASS(hb, hb)(&key);
+ CLASS(hbr, hbr)(&key);
+ auto hb = hbr.hb;
/*
* We can race against put_pi_state() removing itself from the
@@ -1404,21 +1392,50 @@ static void exit_pi_state_list(struct task_struct *curr)
static inline void exit_pi_state_list(struct task_struct *curr) { }
#endif
+bool futex_robust_list_clear_pending(void __user *pop, unsigned int flags)
+{
+ bool size32bit = !!(flags & FLAGS_ROBUST_LIST32);
+
+ if (!IS_ENABLED(CONFIG_64BIT) && !size32bit)
+ return false;
+
+ if (IS_ENABLED(CONFIG_64BIT) && size32bit)
+ return compat_robust_list_clear_pending(pop);
+
+ return robust_list_clear_pending(pop);
+}
+
+#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
+void __futex_fixup_robust_unlock(struct pt_regs *regs, struct futex_unlock_cs_range *csr)
+{
+ /*
+ * arch_futex_robust_unlock_get_pop() returns the list pending op pointer from
+ * @regs if the try_cmpxchg() succeeded.
+ */
+ void __user *pop = arch_futex_robust_unlock_get_pop(regs);
+
+ if (!pop)
+ return;
+
+ futex_robust_list_clear_pending(pop, csr->pop_size32 ? FLAGS_ROBUST_LIST32 : 0);
+}
+#endif /* CONFIG_FUTEX_ROBUST_UNLOCK */
+
static void futex_cleanup(struct task_struct *tsk)
{
- if (unlikely(tsk->robust_list)) {
+ if (unlikely(tsk->futex.robust_list)) {
exit_robust_list(tsk);
- tsk->robust_list = NULL;
+ tsk->futex.robust_list = NULL;
}
#ifdef CONFIG_COMPAT
- if (unlikely(tsk->compat_robust_list)) {
+ if (unlikely(tsk->futex.compat_robust_list)) {
compat_exit_robust_list(tsk);
- tsk->compat_robust_list = NULL;
+ tsk->futex.compat_robust_list = NULL;
}
#endif
- if (unlikely(!list_empty(&tsk->pi_state_list)))
+ if (unlikely(!list_empty(&tsk->futex.pi_state_list)))
exit_pi_state_list(tsk);
}
@@ -1442,23 +1459,23 @@ static void futex_cleanup(struct task_struct *tsk)
void futex_exit_recursive(struct task_struct *tsk)
{
/* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
- if (tsk->futex_state == FUTEX_STATE_EXITING) {
- __assume_ctx_lock(&tsk->futex_exit_mutex);
- mutex_unlock(&tsk->futex_exit_mutex);
+ if (tsk->futex.state == FUTEX_STATE_EXITING) {
+ __assume_ctx_lock(&tsk->futex.exit_mutex);
+ mutex_unlock(&tsk->futex.exit_mutex);
}
- tsk->futex_state = FUTEX_STATE_DEAD;
+ tsk->futex.state = FUTEX_STATE_DEAD;
}
static void futex_cleanup_begin(struct task_struct *tsk)
- __acquires(&tsk->futex_exit_mutex)
+ __acquires(&tsk->futex.exit_mutex)
{
/*
* Prevent various race issues against a concurrent incoming waiter
* including live locks by forcing the waiter to block on
- * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
+ * tsk->futex.exit_mutex when it observes FUTEX_STATE_EXITING in
* attach_to_pi_owner().
*/
- mutex_lock(&tsk->futex_exit_mutex);
+ mutex_lock(&tsk->futex.exit_mutex);
/*
* Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
@@ -1472,23 +1489,23 @@ static void futex_cleanup_begin(struct task_struct *tsk)
* be observed in exit_pi_state_list().
*/
raw_spin_lock_irq(&tsk->pi_lock);
- tsk->futex_state = FUTEX_STATE_EXITING;
+ tsk->futex.state = FUTEX_STATE_EXITING;
raw_spin_unlock_irq(&tsk->pi_lock);
}
static void futex_cleanup_end(struct task_struct *tsk, int state)
- __releases(&tsk->futex_exit_mutex)
+ __releases(&tsk->futex.exit_mutex)
{
/*
* Lockless store. The only side effect is that an observer might
* take another loop until it becomes visible.
*/
- tsk->futex_state = state;
+ tsk->futex.state = state;
/*
* Drop the exit protection. This unblocks waiters which observed
* FUTEX_STATE_EXITING to reevaluate the state.
*/
- mutex_unlock(&tsk->futex_exit_mutex);
+ mutex_unlock(&tsk->futex.exit_mutex);
}
void futex_exec_release(struct task_struct *tsk)
@@ -1516,12 +1533,8 @@ void futex_exit_release(struct task_struct *tsk)
futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}
-static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
- struct futex_private_hash *fph)
+static void futex_hash_bucket_init(struct futex_hash_bucket *fhb)
{
-#ifdef CONFIG_FUTEX_PRIVATE_HASH
- fhb->priv = fph;
-#endif
atomic_set(&fhb->waiters, 0);
plist_head_init(&fhb->chain);
spin_lock_init(&fhb->lock);
@@ -1553,17 +1566,17 @@ static void __futex_ref_atomic_begin(struct futex_private_hash *fph)
* otherwise it would be impossible for it to have reported success
* from futex_ref_is_dead().
*/
- WARN_ON_ONCE(atomic_long_read(&mm->futex_atomic) != 0);
+ WARN_ON_ONCE(atomic_long_read(&mm->futex.phash.atomic) != 0);
/*
* Set the atomic to the bias value such that futex_ref_{get,put}()
* will never observe 0. Will be fixed up in __futex_ref_atomic_end()
* when folding in the percpu count.
*/
- atomic_long_set(&mm->futex_atomic, LONG_MAX);
+ atomic_long_set(&mm->futex.phash.atomic, LONG_MAX);
smp_store_release(&fph->state, FR_ATOMIC);
- call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
+ call_rcu_hurry(&mm->futex.phash.rcu, futex_ref_rcu);
}
static void __futex_ref_atomic_end(struct futex_private_hash *fph)
@@ -1584,7 +1597,7 @@ static void __futex_ref_atomic_end(struct futex_private_hash *fph)
* Therefore the per-cpu counter is now stable, sum and reset.
*/
for_each_possible_cpu(cpu) {
- unsigned int *ptr = per_cpu_ptr(mm->futex_ref, cpu);
+ unsigned int *ptr = per_cpu_ptr(mm->futex.phash.ref, cpu);
count += *ptr;
*ptr = 0;
}
@@ -1592,7 +1605,7 @@ static void __futex_ref_atomic_end(struct futex_private_hash *fph)
/*
* Re-init for the next cycle.
*/
- this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+ this_cpu_inc(*mm->futex.phash.ref); /* 0 -> 1 */
/*
* Add actual count, subtract bias and initial refcount.
@@ -1600,7 +1613,7 @@ static void __futex_ref_atomic_end(struct futex_private_hash *fph)
* The moment this atomic operation happens, futex_ref_is_dead() can
* become true.
*/
- ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex_atomic);
+ ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex.phash.atomic);
if (!ret)
wake_up_var(mm);
@@ -1610,8 +1623,8 @@ static void __futex_ref_atomic_end(struct futex_private_hash *fph)
static void futex_ref_rcu(struct rcu_head *head)
{
- struct mm_struct *mm = container_of(head, struct mm_struct, futex_rcu);
- struct futex_private_hash *fph = rcu_dereference_raw(mm->futex_phash);
+ struct mm_struct *mm = container_of(head, struct mm_struct, futex.phash.rcu);
+ struct futex_private_hash *fph = rcu_dereference_raw(mm->futex.phash.hash);
if (fph->state == FR_PERCPU) {
/*
@@ -1640,7 +1653,7 @@ static void futex_ref_drop(struct futex_private_hash *fph)
/*
* Can only transition the current fph;
*/
- WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph);
+ WARN_ON_ONCE(rcu_dereference_raw(mm->futex.phash.hash) != fph);
/*
* We enqueue at least one RCU callback. Ensure mm stays if the task
* exits before the transition is completed.
@@ -1651,9 +1664,9 @@ static void futex_ref_drop(struct futex_private_hash *fph)
* In order to avoid the following scenario:
*
* futex_hash() __futex_pivot_hash()
- * guard(rcu); guard(mm->futex_hash_lock);
- * fph = mm->futex_phash;
- * rcu_assign_pointer(&mm->futex_phash, new);
+ * guard(rcu); guard(mm->futex.phash.lock);
+ * fph = mm->futex.phash.hash;
+ * rcu_assign_pointer(&mm->futex.phash.hash, new);
* futex_hash_allocate()
* futex_ref_drop()
* fph->state = FR_ATOMIC;
@@ -1668,7 +1681,7 @@ static void futex_ref_drop(struct futex_private_hash *fph)
* There must be at least one full grace-period between publishing a
* new fph and trying to replace it.
*/
- if (poll_state_synchronize_rcu(mm->futex_batches)) {
+ if (poll_state_synchronize_rcu(mm->futex.phash.batches)) {
/*
* There was a grace-period, we can begin now.
*/
@@ -1676,7 +1689,7 @@ static void futex_ref_drop(struct futex_private_hash *fph)
return;
}
- call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
+ call_rcu_hurry(&mm->futex.phash.rcu, futex_ref_rcu);
}
static bool futex_ref_get(struct futex_private_hash *fph)
@@ -1686,11 +1699,11 @@ static bool futex_ref_get(struct futex_private_hash *fph)
guard(preempt)();
if (READ_ONCE(fph->state) == FR_PERCPU) {
- __this_cpu_inc(*mm->futex_ref);
+ __this_cpu_inc(*mm->futex.phash.ref);
return true;
}
- return atomic_long_inc_not_zero(&mm->futex_atomic);
+ return atomic_long_inc_not_zero(&mm->futex.phash.atomic);
}
static bool futex_ref_put(struct futex_private_hash *fph)
@@ -1700,11 +1713,11 @@ static bool futex_ref_put(struct futex_private_hash *fph)
guard(preempt)();
if (READ_ONCE(fph->state) == FR_PERCPU) {
- __this_cpu_dec(*mm->futex_ref);
+ __this_cpu_dec(*mm->futex.phash.ref);
return false;
}
- return atomic_long_dec_and_test(&mm->futex_atomic);
+ return atomic_long_dec_and_test(&mm->futex.phash.atomic);
}
static bool futex_ref_is_dead(struct futex_private_hash *fph)
@@ -1716,28 +1729,23 @@ static bool futex_ref_is_dead(struct futex_private_hash *fph)
if (smp_load_acquire(&fph->state) == FR_PERCPU)
return false;
- return atomic_long_read(&mm->futex_atomic) == 0;
+ return atomic_long_read(&mm->futex.phash.atomic) == 0;
}
-int futex_mm_init(struct mm_struct *mm)
+static void futex_hash_init_mm(struct futex_mm_data *fd)
{
- mutex_init(&mm->futex_hash_lock);
- RCU_INIT_POINTER(mm->futex_phash, NULL);
- mm->futex_phash_new = NULL;
- /* futex-ref */
- mm->futex_ref = NULL;
- atomic_long_set(&mm->futex_atomic, 0);
- mm->futex_batches = get_state_synchronize_rcu();
- return 0;
+ memset(&fd->phash, 0, sizeof(fd->phash));
+ mutex_init(&fd->phash.lock);
+ fd->phash.batches = get_state_synchronize_rcu();
}
void futex_hash_free(struct mm_struct *mm)
{
struct futex_private_hash *fph;
- free_percpu(mm->futex_ref);
- kvfree(mm->futex_phash_new);
- fph = rcu_dereference_raw(mm->futex_phash);
+ free_percpu(mm->futex.phash.ref);
+ kvfree(mm->futex.phash.hash_new);
+ fph = rcu_dereference_raw(mm->futex.phash.hash);
if (fph)
kvfree(fph);
}
@@ -1748,10 +1756,10 @@ static bool futex_pivot_pending(struct mm_struct *mm)
guard(rcu)();
- if (!mm->futex_phash_new)
+ if (!mm->futex.phash.hash_new)
return true;
- fph = rcu_dereference(mm->futex_phash);
+ fph = rcu_dereference(mm->futex.phash.hash);
return futex_ref_is_dead(fph);
}
@@ -1793,7 +1801,7 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
* Once we've disabled the global hash there is no way back.
*/
scoped_guard(rcu) {
- fph = rcu_dereference(mm->futex_phash);
+ fph = rcu_dereference(mm->futex.phash.hash);
if (fph && !fph->hash_mask) {
if (custom)
return -EBUSY;
@@ -1801,15 +1809,15 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
}
}
- if (!mm->futex_ref) {
+ if (!mm->futex.phash.ref) {
/*
* This will always be allocated by the first thread and
* therefore requires no locking.
*/
- mm->futex_ref = alloc_percpu(unsigned int);
- if (!mm->futex_ref)
+ mm->futex.phash.ref = alloc_percpu(unsigned int);
+ if (!mm->futex.phash.ref)
return -ENOMEM;
- this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+ this_cpu_inc(*mm->futex.phash.ref); /* 0 -> 1 */
}
fph = kvzalloc(struct_size(fph, queues, hash_slots),
@@ -1822,7 +1830,7 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
fph->mm = mm;
for (i = 0; i < hash_slots; i++)
- futex_hash_bucket_init(&fph->queues[i], fph);
+ futex_hash_bucket_init(&fph->queues[i]);
if (custom) {
/*
@@ -1832,14 +1840,14 @@ again:
wait_var_event(mm, futex_pivot_pending(mm));
}
- scoped_guard(mutex, &mm->futex_hash_lock) {
+ scoped_guard(mutex, &mm->futex.phash.lock) {
struct futex_private_hash *free __free(kvfree) = NULL;
struct futex_private_hash *cur, *new;
- cur = rcu_dereference_protected(mm->futex_phash,
- lockdep_is_held(&mm->futex_hash_lock));
- new = mm->futex_phash_new;
- mm->futex_phash_new = NULL;
+ cur = rcu_dereference_protected(mm->futex.phash.hash,
+ lockdep_is_held(&mm->futex.phash.lock));
+ new = mm->futex.phash.hash_new;
+ mm->futex.phash.hash_new = NULL;
if (fph) {
if (cur && !cur->hash_mask) {
@@ -1849,7 +1857,7 @@ again:
* the second one returns here.
*/
free = fph;
- mm->futex_phash_new = new;
+ mm->futex.phash.hash_new = new;
return -EBUSY;
}
if (cur && !new) {
@@ -1879,7 +1887,7 @@ again:
if (new) {
/*
- * Will set mm->futex_phash_new on failure;
+ * Will set mm->futex.phash.hash_new on failure;
* futex_private_hash_get() will try again.
*/
if (!__futex_pivot_hash(mm, new) && custom)
@@ -1898,11 +1906,9 @@ int futex_hash_allocate_default(void)
return 0;
scoped_guard(rcu) {
- threads = min_t(unsigned int,
- get_nr_threads(current),
- num_online_cpus());
+ threads = min_t(unsigned int, get_nr_threads(current), num_online_cpus());
- fph = rcu_dereference(current->mm->futex_phash);
+ fph = rcu_dereference(current->mm->futex.phash.hash);
if (fph) {
if (fph->custom)
return 0;
@@ -1929,24 +1935,52 @@ static int futex_hash_get_slots(void)
struct futex_private_hash *fph;
guard(rcu)();
- fph = rcu_dereference(current->mm->futex_phash);
+ fph = rcu_dereference(current->mm->futex.phash.hash);
if (fph && fph->hash_mask)
return fph->hash_mask + 1;
return 0;
}
+#else /* CONFIG_FUTEX_PRIVATE_HASH */
+static inline int futex_hash_allocate(unsigned int hslots, unsigned int flags) { return -EINVAL; }
+static inline int futex_hash_get_slots(void) { return 0; }
+static inline void futex_hash_init_mm(struct futex_mm_data *fd) { }
+#endif /* !CONFIG_FUTEX_PRIVATE_HASH */
-#else
+#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
+static void futex_invalidate_cs_ranges(struct futex_mm_data *fd)
+{
+ /*
+ * Invalidate start_ip so that the quick check fails for ip >= start_ip
+ * if VDSO is not mapped or the second slot is not available for compat
+ * tasks as they use VDSO32 which does not provide the 64-bit pointer
+ * variant.
+ */
+ for (int i = 0; i < FUTEX_ROBUST_MAX_CS_RANGES; i++)
+ fd->unlock.cs_ranges[i].start_ip = ~0UL;
+}
-static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
+void futex_reset_cs_ranges(struct futex_mm_data *fd)
{
- return -EINVAL;
+ memset(fd->unlock.cs_ranges, 0, sizeof(fd->unlock.cs_ranges));
+ futex_invalidate_cs_ranges(fd);
}
-static int futex_hash_get_slots(void)
+static void futex_robust_unlock_init_mm(struct futex_mm_data *fd)
{
- return 0;
+ /* mm_dup() preserves the range, mm_alloc() clears it */
+ if (!fd->unlock.cs_ranges[0].start_ip)
+ futex_invalidate_cs_ranges(fd);
}
+#else /* CONFIG_FUTEX_ROBUST_UNLOCK */
+static inline void futex_robust_unlock_init_mm(struct futex_mm_data *fd) { }
+#endif /* !CONFIG_FUTEX_ROBUST_UNLOCK */
+#if defined(CONFIG_FUTEX_PRIVATE_HASH) || defined(CONFIG_FUTEX_ROBUST_UNLOCK)
+void futex_mm_init(struct mm_struct *mm)
+{
+ futex_hash_init_mm(&mm->futex);
+ futex_robust_unlock_init_mm(&mm->futex);
+}
#endif
int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
@@ -2001,7 +2035,7 @@ static int __init futex_init(void)
BUG_ON(!table);
for (i = 0; i < hashsize; i++)
- futex_hash_bucket_init(&table[i], NULL);
+ futex_hash_bucket_init(&table[i]);
futex_queues[n] = table;
}
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 9f6bf6f585fc..f00f0863ed44 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -40,6 +40,8 @@
#define FLAGS_NUMA 0x0080
#define FLAGS_STRICT 0x0100
#define FLAGS_MPOL 0x0200
+#define FLAGS_ROBUST_UNLOCK 0x0400
+#define FLAGS_ROBUST_LIST32 0x0800
/* FUTEX_ to FLAGS_ */
static inline unsigned int futex_to_flags(unsigned int op)
@@ -52,6 +54,12 @@ static inline unsigned int futex_to_flags(unsigned int op)
if (op & FUTEX_CLOCK_REALTIME)
flags |= FLAGS_CLOCKRT;
+ if (op & FUTEX_ROBUST_UNLOCK)
+ flags |= FLAGS_ROBUST_UNLOCK;
+
+ if (op & FUTEX_ROBUST_LIST32)
+ flags |= FLAGS_ROBUST_LIST32;
+
return flags;
}
@@ -126,6 +134,15 @@ static inline bool should_fail_futex(bool fshared)
}
#endif
+static inline bool futex_key_is_private(union futex_key *key)
+{
+ /*
+ * Relies on get_futex_key() to set either bit for shared
+ * futexes -- see comment with union futex_key.
+ */
+ return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED));
+}
+
/*
* Hash buckets are shared by all the futex_keys that hash to the same
* location. Each key may have multiple futex_q structures, one for each task
@@ -135,7 +152,6 @@ struct futex_hash_bucket {
atomic_t waiters;
spinlock_t lock;
struct plist_head chain;
- struct futex_private_hash *priv;
} ____cacheline_aligned_in_smp;
/*
@@ -175,7 +191,7 @@ typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q);
* @requeue_pi_key: the requeue_pi target futex key
* @bitset: bitset for the optional bitmasked wakeup
* @requeue_state: State field for futex_requeue_pi()
- * @drop_hb_ref: Waiter should drop the extra hash bucket reference if true
+ * @drop_fph: Waiter should drop the extra private hash reference when set
* @requeue_wait: RCU wait for futex_requeue_pi() (RT only)
*
* We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
@@ -202,7 +218,7 @@ struct futex_q {
union futex_key *requeue_pi_key;
u32 bitset;
atomic_t requeue_state;
- bool drop_hb_ref;
+ struct futex_private_hash *drop_fph;
#ifdef CONFIG_PREEMPT_RT
struct rcuwait requeue_wait;
#endif
@@ -222,28 +238,29 @@ extern struct hrtimer_sleeper *
futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
int flags, u64 range_ns);
-extern struct futex_hash_bucket *futex_hash(union futex_key *key);
-#ifdef CONFIG_FUTEX_PRIVATE_HASH
-extern void futex_hash_get(struct futex_hash_bucket *hb);
-extern void futex_hash_put(struct futex_hash_bucket *hb);
+struct futex_bucket_ref {
+ struct futex_hash_bucket *hb;
+ struct futex_private_hash *fph;
+};
-extern struct futex_private_hash *futex_private_hash(void);
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+extern struct futex_private_hash *futex_private_hash(struct mm_struct *mm);
extern void futex_private_hash_put(struct futex_private_hash *fph);
#else /* !CONFIG_FUTEX_PRIVATE_HASH */
-static inline void futex_hash_get(struct futex_hash_bucket *hb) { }
-static inline void futex_hash_put(struct futex_hash_bucket *hb) { }
-static inline struct futex_private_hash *futex_private_hash(void) { return NULL; }
+static inline struct futex_private_hash *futex_private_hash(struct mm_struct *mm) { return NULL; }
static inline void futex_private_hash_put(struct futex_private_hash *fph) { }
#endif
-DEFINE_CLASS(hb, struct futex_hash_bucket *,
- if (_T) futex_hash_put(_T),
+extern struct futex_bucket_ref futex_hash(union futex_key *key);
+
+DEFINE_CLASS(hbr, struct futex_bucket_ref,
+ if (_T.fph) futex_private_hash_put(_T.fph),
futex_hash(key), union futex_key *key);
DEFINE_CLASS(private_hash, struct futex_private_hash *,
if (_T) futex_private_hash_put(_T),
- futex_private_hash(), void);
+ futex_private_hash(mm), struct mm_struct *mm);
/**
* futex_match - Check whether two futex keys are equal
@@ -449,13 +466,16 @@ extern int futex_unqueue_multiple(struct futex_vector *v, int count);
extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
struct hrtimer_sleeper *to);
-extern int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset);
+extern int futex_wake(u32 __user *uaddr, unsigned int flags, void __user *pop,
+ int nr_wake, u32 bitset);
extern int futex_wake_op(u32 __user *uaddr1, unsigned int flags,
u32 __user *uaddr2, int nr_wake, int nr_wake2, int op);
-extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags);
+extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags, void __user *pop);
extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock);
+bool futex_robust_list_clear_pending(void __user *pop, unsigned int flags);
+
#endif /* _FUTEX_H */
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index 643199fdbe62..795011ea1202 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -14,7 +14,7 @@ int refill_pi_state_cache(void)
{
struct futex_pi_state *pi_state;
- if (likely(current->pi_state_cache))
+ if (likely(current->futex.pi_state_cache))
return 0;
pi_state = kzalloc_obj(*pi_state);
@@ -28,17 +28,17 @@ int refill_pi_state_cache(void)
refcount_set(&pi_state->refcount, 1);
pi_state->key = FUTEX_KEY_INIT;
- current->pi_state_cache = pi_state;
+ current->futex.pi_state_cache = pi_state;
return 0;
}
static struct futex_pi_state *alloc_pi_state(void)
{
- struct futex_pi_state *pi_state = current->pi_state_cache;
+ struct futex_pi_state *pi_state = current->futex.pi_state_cache;
WARN_ON(!pi_state);
- current->pi_state_cache = NULL;
+ current->futex.pi_state_cache = NULL;
return pi_state;
}
@@ -60,7 +60,7 @@ static void pi_state_update_owner(struct futex_pi_state *pi_state,
if (new_owner) {
raw_spin_lock(&new_owner->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
- list_add(&pi_state->list, &new_owner->pi_state_list);
+ list_add(&pi_state->list, &new_owner->futex.pi_state_list);
pi_state->owner = new_owner;
raw_spin_unlock(&new_owner->pi_lock);
}
@@ -96,7 +96,7 @@ void put_pi_state(struct futex_pi_state *pi_state)
raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
}
- if (current->pi_state_cache) {
+ if (current->futex.pi_state_cache) {
kfree(pi_state);
} else {
/*
@@ -106,7 +106,7 @@ void put_pi_state(struct futex_pi_state *pi_state)
*/
pi_state->owner = NULL;
refcount_set(&pi_state->refcount, 1);
- current->pi_state_cache = pi_state;
+ current->futex.pi_state_cache = pi_state;
}
}
@@ -179,7 +179,7 @@ void put_pi_state(struct futex_pi_state *pi_state)
*
* p->pi_lock:
*
- * p->pi_state_list -> pi_state->list, relation
+ * p->futex.pi_state_list -> pi_state->list, relation
* pi_mutex->owner -> pi_state->owner, relation
*
* pi_state->refcount:
@@ -327,7 +327,7 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval,
* If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
* caller that the alleged owner is busy.
*/
- if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
+ if (tsk && tsk->futex.state != FUTEX_STATE_DEAD)
return -EBUSY;
/*
@@ -346,8 +346,8 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval,
* *uaddr = 0xC0000000; tsk = get_task(PID);
* } if (!tsk->flags & PF_EXITING) {
* ... attach();
- * tsk->futex_state = } else {
- * FUTEX_STATE_DEAD; if (tsk->futex_state !=
+ * tsk->futex.state = } else {
+ * FUTEX_STATE_DEAD; if (tsk->futex.state !=
* FUTEX_STATE_DEAD)
* return -EAGAIN;
* return -ESRCH; <--- FAIL
@@ -396,7 +396,7 @@ static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
pi_state->key = *key;
WARN_ON(!list_empty(&pi_state->list));
- list_add(&pi_state->list, &p->pi_state_list);
+ list_add(&pi_state->list, &p->futex.pi_state_list);
/*
* Assignment without holding pi_state->pi_mutex.wait_lock is safe
* because there is no concurrency as the object is not published yet.
@@ -440,7 +440,7 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
* in futex_exit_release(), we do this protected by p->pi_lock:
*/
raw_spin_lock_irq(&p->pi_lock);
- if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
+ if (unlikely(p->futex.state != FUTEX_STATE_OK)) {
/*
* The task is on the way out. When the futex state is
* FUTEX_STATE_DEAD, we know that the task has finished
@@ -945,7 +945,8 @@ retry:
retry_private:
if (1) {
- CLASS(hb, hb)(&q.key);
+ CLASS(hbr, hbr)(&q.key);
+ auto hb = hbr.hb;
futex_q_lock(&q, hb);
@@ -1009,7 +1010,7 @@ retry_private:
* the thread, performing resize, will block on hb->lock during
* the requeue.
*/
- futex_hash_put(no_free_ptr(hb));
+ futex_private_hash_put(no_free_ptr(hbr.fph));
/*
* Must be done before we enqueue the waiter, here is unfortunately
* under the hb lock, but that *should* work because it does nothing.
@@ -1100,11 +1101,9 @@ no_block:
__release(&hb->lock);
futex_unqueue_pi(&q);
spin_unlock(q.lock_ptr);
- if (q.drop_hb_ref) {
- CLASS(hb, hb)(&q.key);
- /* Additional reference from futex_unlock_pi() */
- futex_hash_put(hb);
- }
+
+ /* Additional reference from futex_unlock_pi() */
+ futex_private_hash_put(q.drop_fph);
goto out;
out_unlock_put_key:
@@ -1139,7 +1138,7 @@ out:
* This is the in-kernel slowpath: we look up the PI state (if any),
* and do the rt-mutex unlock.
*/
-int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
+static int __futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
u32 curval, uval, vpid = task_pid_vnr(current);
union futex_key key = FUTEX_KEY_INIT;
@@ -1148,7 +1147,6 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
if (!IS_ENABLED(CONFIG_FUTEX_PI))
return -ENOSYS;
-
retry:
if (get_user(uval, uaddr))
return -EFAULT;
@@ -1162,7 +1160,8 @@ retry:
if (ret)
return ret;
- CLASS(hb, hb)(&key);
+ CLASS(hbr, hbr)(&key);
+ auto hb = hbr.hb;
spin_lock(&hb->lock);
retry_hb:
@@ -1219,8 +1218,9 @@ retry_hb:
* Acquire a reference for the leaving waiter to ensure
* valid futex_q::lock_ptr.
*/
- futex_hash_get(hb);
- top_waiter->drop_hb_ref = true;
+ if (futex_key_is_private(&key))
+ top_waiter->drop_fph = futex_private_hash(key.private.mm);
+
__futex_unqueue(top_waiter);
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
goto retry_hb;
@@ -1302,3 +1302,15 @@ pi_faulted:
return ret;
}
+int futex_unlock_pi(u32 __user *uaddr, unsigned int flags, void __user *pop)
+{
+ int ret = __futex_unlock_pi(uaddr, flags);
+
+ if (ret || !(flags & FLAGS_ROBUST_UNLOCK))
+ return ret;
+
+ if (!futex_robust_list_clear_pending(pop, flags))
+ return -EFAULT;
+
+ return 0;
+}
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index b597cb3d17fc..7384672916fb 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -241,8 +241,8 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
* Acquire a reference for the waiter to ensure valid
* futex_q::lock_ptr.
*/
- futex_hash_get(hb);
- q->drop_hb_ref = true;
+ if (futex_key_is_private(key))
+ q->drop_fph = futex_private_hash(key->private.mm);
q->lock_ptr = &hb->lock;
task = READ_ONCE(q->task);
@@ -459,8 +459,10 @@ retry:
retry_private:
if (1) {
- CLASS(hb, hb1)(&key1);
- CLASS(hb, hb2)(&key2);
+ CLASS(hbr, hbr1)(&key1);
+ CLASS(hbr, hbr2)(&key2);
+ auto hb1 = hbr1.hb;
+ auto hb2 = hbr2.hb;
futex_hb_waiters_inc(hb2);
double_lock_hb(hb1, hb2);
@@ -643,6 +645,12 @@ retry_private:
continue;
}
+ /* Self-deadlock: non-top waiter already owns the PI futex. */
+ if (rt_mutex_owner(&pi_state->pi_mutex) == this->task) {
+ ret = -EDEADLK;
+ break;
+ }
+
ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
this->rt_waiter,
this->task);
@@ -832,7 +840,8 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
switch (futex_requeue_pi_wakeup_sync(&q)) {
case Q_REQUEUE_PI_IGNORE:
{
- CLASS(hb, hb)(&q.key);
+ CLASS(hbr, hbr)(&q.key);
+ auto hb = hbr.hb;
/* The waiter is still on uaddr1 */
spin_lock(&hb->lock);
ret = handle_early_requeue_pi_wakeup(hb, &q, to);
@@ -902,11 +911,8 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
default:
BUG();
}
- if (q.drop_hb_ref) {
- CLASS(hb, hb)(&q.key);
- /* Additional reference from requeue_pi_wake_futex() */
- futex_hash_put(hb);
- }
+ /* Additional reference from requeue_pi_wake_futex() */
+ futex_private_hash_put(q.drop_fph);
out:
if (to) {
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 77ad9691f6a6..2fa19d9d008d 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -25,17 +25,13 @@
* @head: pointer to the list-head
* @len: length of the list-head, as userspace expects
*/
-SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
- size_t, len)
+SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, size_t, len)
{
- /*
- * The kernel knows only one size for now:
- */
+ /* The kernel knows only one size for now. */
if (unlikely(len != sizeof(*head)))
return -EINVAL;
- current->robust_list = head;
-
+ current->futex.robust_list = head;
return 0;
}
@@ -43,9 +39,9 @@ static inline void __user *futex_task_robust_list(struct task_struct *p, bool co
{
#ifdef CONFIG_COMPAT
if (compat)
- return p->compat_robust_list;
+ return p->futex.compat_robust_list;
#endif
- return p->robust_list;
+ return p->futex.robust_list;
}
static void __user *futex_get_robust_list_common(int pid, bool compat)
@@ -122,6 +118,13 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
return -ENOSYS;
}
+ if (flags & FLAGS_ROBUST_UNLOCK) {
+ if (cmd != FUTEX_WAKE &&
+ cmd != FUTEX_WAKE_BITSET &&
+ cmd != FUTEX_UNLOCK_PI)
+ return -ENOSYS;
+ }
+
switch (cmd) {
case FUTEX_WAIT:
val3 = FUTEX_BITSET_MATCH_ANY;
@@ -132,7 +135,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
val3 = FUTEX_BITSET_MATCH_ANY;
fallthrough;
case FUTEX_WAKE_BITSET:
- return futex_wake(uaddr, flags, val, val3);
+ return futex_wake(uaddr, flags, uaddr2, val, val3);
case FUTEX_REQUEUE:
return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0);
case FUTEX_CMP_REQUEUE:
@@ -145,7 +148,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
case FUTEX_LOCK_PI2:
return futex_lock_pi(uaddr, flags, timeout, 0);
case FUTEX_UNLOCK_PI:
- return futex_unlock_pi(uaddr, flags);
+ return futex_unlock_pi(uaddr, flags, uaddr2);
case FUTEX_TRYLOCK_PI:
return futex_lock_pi(uaddr, flags, NULL, 1);
case FUTEX_WAIT_REQUEUE_PI:
@@ -379,7 +382,7 @@ SYSCALL_DEFINE4(futex_wake,
if (!futex_validate_input(flags, mask))
return -EINVAL;
- return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask);
+ return futex_wake(uaddr, FLAGS_STRICT | flags, NULL, nr, mask);
}
/*
@@ -475,15 +478,13 @@ SYSCALL_DEFINE4(futex_requeue,
}
#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE2(set_robust_list,
- struct compat_robust_list_head __user *, head,
- compat_size_t, len)
+COMPAT_SYSCALL_DEFINE2(set_robust_list, struct compat_robust_list_head __user *, head,
+ compat_size_t, len)
{
if (unlikely(len != sizeof(*head)))
return -EINVAL;
- current->compat_robust_list = head;
-
+ current->futex.compat_robust_list = head;
return 0;
}
@@ -523,4 +524,3 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
}
#endif /* CONFIG_COMPAT_32BIT_TIME */
-
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index ceed9d879059..d4483d15d30a 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -150,12 +150,35 @@ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
}
/*
+ * If requested, unlock the futex and then clear the robust list pending op
+ */
+static bool futex_robust_unlock(u32 __user *uaddr, unsigned int flags, void __user *pop)
+{
+ if (!(flags & FLAGS_ROBUST_UNLOCK))
+ return true;
+
+ /* First unlock the futex, which requires release semantics. */
+ scoped_user_write_access(uaddr, efault)
+ unsafe_atomic_store_release_user(0, uaddr, efault);
+
+ /*
+ * Clear the pending list op now. If that fails, then the task is in
+ * deeper trouble as the robust list head is usually part of the TLS.
+ * The chance of survival is close to zero.
+ */
+ return futex_robust_list_clear_pending(pop, flags);
+
+efault:
+ return false;
+}
+
+/*
* Wake up waiters matching bitset queued on this futex (uaddr).
*/
-int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
+int futex_wake(u32 __user *uaddr, unsigned int flags, void __user *pop, int nr_wake, u32 bitset)
{
- struct futex_q *this, *next;
union futex_key key = FUTEX_KEY_INIT;
+ struct futex_q *this, *next;
DEFINE_WAKE_Q(wake_q);
int ret;
@@ -166,10 +189,14 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if (unlikely(ret != 0))
return ret;
+ if (!futex_robust_unlock(uaddr, flags, pop))
+ return -EFAULT;
+
if ((flags & FLAGS_STRICT) && !nr_wake)
return 0;
- CLASS(hb, hb)(&key);
+ CLASS(hbr, hbr)(&key);
+ auto hb = hbr.hb;
/* Make sure we really have tasks to wakeup */
if (!futex_hb_waiters_pending(hb))
@@ -266,8 +293,10 @@ retry:
retry_private:
if (1) {
- CLASS(hb, hb1)(&key1);
- CLASS(hb, hb2)(&key2);
+ CLASS(hbr, hbr1)(&key1);
+ CLASS(hbr, hbr2)(&key2);
+ auto hb1 = hbr1.hb;
+ auto hb2 = hbr2.hb;
double_lock_hb(hb1, hb2);
op_ret = futex_atomic_op_inuser(op, uaddr2);
@@ -409,7 +438,7 @@ int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
* Make sure to have a reference on the private_hash such that we
* don't block on rehash after changing the task state below.
*/
- guard(private_hash)();
+ guard(private_hash)(current->mm);
/*
* Enqueuing multiple futexes is tricky, because we need to enqueue
@@ -446,7 +475,8 @@ retry:
u32 val = vs[i].w.val;
if (1) {
- CLASS(hb, hb)(&q->key);
+ CLASS(hbr, hbr)(&q->key);
+ auto hb = hbr.hb;
futex_q_lock(q, hb);
ret = futex_get_value_locked(&uval, uaddr);
@@ -621,7 +651,8 @@ retry:
retry_private:
if (1) {
- CLASS(hb, hb)(&q->key);
+ CLASS(hbr, hbr)(&q->key);
+ auto hb = hbr.hb;
futex_q_lock(q, hb);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6c9b1dc4e7d4..de754db414d1 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -14,6 +14,7 @@
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/irqdomain.h>
+#include <linux/preempt.h>
#include <linux/random.h>
#include <trace/events/irq.h>
@@ -47,9 +48,11 @@ int irq_set_chip(unsigned int irq, const struct irq_chip *chip)
scoped_irqdesc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip);
ret = 0;
}
- /* For !CONFIG_SPARSE_IRQ make the irq show up in allocated_irqs. */
- if (!ret)
+ if (!ret) {
+ /* For !CONFIG_SPARSE_IRQ make the irq show up in allocated_irqs. */
irq_mark_irq(irq);
+ irq_proc_update_chip(chip);
+ }
return ret;
}
EXPORT_SYMBOL(irq_set_chip);
@@ -893,7 +896,10 @@ void handle_percpu_irq(struct irq_desc *desc)
*
* action->percpu_dev_id is a pointer to percpu variables which
* contain the real device id for the cpu on which this handler is
- * called
+ * called.
+ *
+ * May be used for NMI interrupt lines, and so may be called in IRQ or NMI
+ * context.
*/
void handle_percpu_devid_irq(struct irq_desc *desc)
{
@@ -930,7 +936,8 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
enabled ? " and unmasked" : "", irq, cpu);
}
- add_interrupt_randomness(irq);
+ if (!in_nmi())
+ add_interrupt_randomness(irq);
if (chip->irq_eoi)
chip->irq_eoi(&desc->irq_data);
@@ -1007,6 +1014,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
WARN_ON(irq_chip_pm_get(irq_desc_get_irq_data(desc)));
irq_activate_and_startup(desc, IRQ_RESEND);
}
+ irq_proc_update_valid(desc);
}
void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
@@ -1067,6 +1075,7 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
trigger = tmp;
irqd_set(&desc->irq_data, trigger);
+ irq_proc_update_valid(desc);
}
}
EXPORT_SYMBOL_GPL(irq_modify_status);
diff --git a/kernel/irq/debugfs.h b/kernel/irq/debugfs.h
new file mode 100644
index 000000000000..8a9360d5fefb
--- /dev/null
+++ b/kernel/irq/debugfs.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _KERNEL_IRQ_DEBUGFS_H
+#define _KERNEL_IRQ_DEBUGFS_H
+
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+#include <linux/debugfs.h>
+
+struct irq_bit_descr {
+ unsigned int mask;
+ char *name;
+};
+
+#define BIT_MASK_DESCR(m) { .mask = m, .name = #m }
+
+void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state,
+ const struct irq_bit_descr *sd, int size);
+
+void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc);
+static inline void irq_remove_debugfs_entry(struct irq_desc *desc)
+{
+ debugfs_remove(desc->debugfs_file);
+ kfree(desc->dev_name);
+}
+void irq_debugfs_copy_devname(int irq, struct device *dev);
+# ifdef CONFIG_IRQ_DOMAIN
+void irq_domain_debugfs_init(struct dentry *root);
+# else
+static inline void irq_domain_debugfs_init(struct dentry *root)
+{
+}
+# endif
+#else /* CONFIG_GENERIC_IRQ_DEBUGFS */
+static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d)
+{
+}
+static inline void irq_remove_debugfs_entry(struct irq_desc *d)
+{
+}
+static inline void irq_debugfs_copy_devname(int irq, struct device *dev)
+{
+}
+#endif /* CONFIG_GENERIC_IRQ_DEBUGFS */
+
+#endif
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 9412e57056f5..0ce21dd45404 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -9,8 +9,12 @@
#include <linux/irqdesc.h>
#include <linux/kernel_stat.h>
#include <linux/pm_runtime.h>
+#include <linux/rcuref.h>
#include <linux/sched/clock.h>
+#include "debugfs.h"
+#include "proc.h"
+
#ifdef CONFIG_SPARSE_IRQ
# define MAX_SPARSE_IRQS INT_MAX
#else
@@ -21,6 +25,7 @@
extern bool noirqdebug;
extern int irq_poll_cpu;
+extern unsigned int total_nr_irqs;
extern struct irqaction chained_action;
@@ -100,9 +105,23 @@ extern void unmask_irq(struct irq_desc *desc);
extern void unmask_threaded_irq(struct irq_desc *desc);
#ifdef CONFIG_SPARSE_IRQ
-static inline void irq_mark_irq(unsigned int irq) { }
+static __always_inline void irq_mark_irq(unsigned int irq) { }
+void irq_desc_free_rcu(struct irq_desc *desc);
+
+static __always_inline bool irq_desc_get_ref(struct irq_desc *desc)
+{
+ return rcuref_get(&desc->refcnt);
+}
+
+static __always_inline void irq_desc_put_ref(struct irq_desc *desc)
+{
+ if (rcuref_put(&desc->refcnt))
+ irq_desc_free_rcu(desc);
+}
#else
extern void irq_mark_irq(unsigned int irq);
+static __always_inline bool irq_desc_get_ref(struct irq_desc *desc) { return true; }
+static __always_inline void irq_desc_put_ref(struct irq_desc *desc) { }
#endif
irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc);
@@ -122,6 +141,7 @@ extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc);
extern void register_handler_proc(unsigned int irq, struct irqaction *action);
extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
+void irq_proc_update_valid(struct irq_desc *desc);
#else
static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { }
static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { }
@@ -129,8 +149,11 @@ static inline void register_handler_proc(unsigned int irq,
struct irqaction *action) { }
static inline void unregister_handler_proc(unsigned int irq,
struct irqaction *action) { }
+static inline void irq_proc_update_valid(struct irq_desc *desc) { }
#endif
+struct irq_desc *irq_find_desc_at_or_after(unsigned int offset);
+
extern bool irq_can_set_affinity_usr(unsigned int irq);
extern int irq_do_set_affinity(struct irq_data *data,
@@ -171,7 +194,7 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
__DEFINE_CLASS_IS_CONDITIONAL(irqdesc_lock, true);
__DEFINE_UNLOCK_GUARD(irqdesc_lock, struct irq_desc,
- __irq_put_desc_unlock(_T->lock, _T->flags, _T->bus),
+ if (_T->lock) __irq_put_desc_unlock(_T->lock, _T->flags, _T->bus),
unsigned long flags; bool bus);
static inline class_irqdesc_lock_t class_irqdesc_lock_constructor(unsigned int irq, bool bus,
@@ -372,42 +395,3 @@ static inline struct irq_data *irqd_get_parent_data(struct irq_data *irqd)
return NULL;
#endif
}
-
-#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
-#include <linux/debugfs.h>
-
-struct irq_bit_descr {
- unsigned int mask;
- char *name;
-};
-
-#define BIT_MASK_DESCR(m) { .mask = m, .name = #m }
-
-void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state,
- const struct irq_bit_descr *sd, int size);
-
-void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc);
-static inline void irq_remove_debugfs_entry(struct irq_desc *desc)
-{
- debugfs_remove(desc->debugfs_file);
- kfree(desc->dev_name);
-}
-void irq_debugfs_copy_devname(int irq, struct device *dev);
-# ifdef CONFIG_IRQ_DOMAIN
-void irq_domain_debugfs_init(struct dentry *root);
-# else
-static inline void irq_domain_debugfs_init(struct dentry *root)
-{
-}
-# endif
-#else /* CONFIG_GENERIC_IRQ_DEBUGFS */
-static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d)
-{
-}
-static inline void irq_remove_debugfs_entry(struct irq_desc *d)
-{
-}
-static inline void irq_debugfs_copy_devname(int irq, struct device *dev)
-{
-}
-#endif /* CONFIG_GENERIC_IRQ_DEBUGFS */
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 7173b8b634f2..80ef4e27dcf4 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -137,17 +137,18 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
desc->tot_count = 0;
desc->name = NULL;
desc->owner = owner;
+ rcuref_init(&desc->refcnt, 1);
desc_smp_init(desc, node, affinity);
}
-static unsigned int nr_irqs = NR_IRQS;
+unsigned int total_nr_irqs __read_mostly = NR_IRQS;
/**
* irq_get_nr_irqs() - Number of interrupts supported by the system.
*/
unsigned int irq_get_nr_irqs(void)
{
- return nr_irqs;
+ return total_nr_irqs;
}
EXPORT_SYMBOL_GPL(irq_get_nr_irqs);
@@ -157,13 +158,12 @@ EXPORT_SYMBOL_GPL(irq_get_nr_irqs);
*
* Return: @nr.
*/
-unsigned int irq_set_nr_irqs(unsigned int nr)
+unsigned int __init irq_set_nr_irqs(unsigned int nr)
{
- nr_irqs = nr;
-
+ total_nr_irqs = nr;
+ irq_proc_calc_prec();
return nr;
}
-EXPORT_SYMBOL_GPL(irq_set_nr_irqs);
static DEFINE_MUTEX(sparse_irq_lock);
static struct maple_tree sparse_irqs = MTREE_INIT_EXT(sparse_irqs,
@@ -181,15 +181,12 @@ static int irq_find_free_area(unsigned int from, unsigned int cnt)
return mas.index;
}
-static unsigned int irq_find_at_or_after(unsigned int offset)
+struct irq_desc *irq_find_desc_at_or_after(unsigned int offset)
{
unsigned long index = offset;
- struct irq_desc *desc;
-
- guard(rcu)();
- desc = mt_find(&sparse_irqs, &index, nr_irqs);
- return desc ? irq_desc_get_irq(desc) : nr_irqs;
+ lockdep_assert_in_rcu_read_lock();
+ return mt_find(&sparse_irqs, &index, total_nr_irqs);
}
static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
@@ -466,6 +463,17 @@ static void delayed_free_desc(struct rcu_head *rhp)
kobject_put(&desc->kobj);
}
+void irq_desc_free_rcu(struct irq_desc *desc)
+{
+ /*
+ * We free the descriptor, masks and stat fields via RCU. That
+ * allows demultiplex interrupts to do rcu based management of
+ * the child interrupts.
+ * This also allows us to use rcu in kstat_irqs_usr().
+ */
+ call_rcu(&desc->rcu, delayed_free_desc);
+}
+
static void free_desc(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -484,14 +492,7 @@ static void free_desc(unsigned int irq)
*/
irq_sysfs_del(desc);
delete_irq_desc(irq);
-
- /*
- * We free the descriptor, masks and stat fields via RCU. That
- * allows demultiplex interrupts to do rcu based management of
- * the child interrupts.
- * This also allows us to use rcu in kstat_irqs_usr().
- */
- call_rcu(&desc->rcu, delayed_free_desc);
+ irq_desc_put_ref(desc);
}
static int alloc_descs(unsigned int start, unsigned int cnt, int node,
@@ -543,7 +544,8 @@ static bool irq_expand_nr_irqs(unsigned int nr)
{
if (nr > MAX_SPARSE_IRQS)
return false;
- nr_irqs = nr;
+ total_nr_irqs = nr;
+ irq_proc_calc_prec();
return true;
}
@@ -557,21 +559,22 @@ int __init early_irq_init(void)
/* Let arch update nr_irqs and return the nr of preallocated irqs */
initcnt = arch_probe_nr_irqs();
printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n",
- NR_IRQS, nr_irqs, initcnt);
+ NR_IRQS, total_nr_irqs, initcnt);
- if (WARN_ON(nr_irqs > MAX_SPARSE_IRQS))
- nr_irqs = MAX_SPARSE_IRQS;
+ if (WARN_ON(total_nr_irqs > MAX_SPARSE_IRQS))
+ total_nr_irqs = MAX_SPARSE_IRQS;
if (WARN_ON(initcnt > MAX_SPARSE_IRQS))
initcnt = MAX_SPARSE_IRQS;
- if (initcnt > nr_irqs)
- nr_irqs = initcnt;
+ if (initcnt > total_nr_irqs)
+ total_nr_irqs = initcnt;
for (i = 0; i < initcnt; i++) {
desc = alloc_desc(i, node, 0, NULL, NULL);
irq_insert_desc(i, desc);
}
+ irq_proc_calc_prec();
return arch_early_irq_init();
}
@@ -592,7 +595,7 @@ int __init early_irq_init(void)
init_irq_default_affinity();
- printk(KERN_INFO "NR_IRQS: %d\n", NR_IRQS);
+ pr_info("NR_IRQS: %d\n", NR_IRQS);
count = ARRAY_SIZE(irq_desc);
@@ -602,6 +605,7 @@ int __init early_irq_init(void)
goto __free_desc_res;
}
+ irq_proc_calc_prec();
return arch_early_irq_init();
__free_desc_res:
@@ -862,7 +866,7 @@ void irq_free_descs(unsigned int from, unsigned int cnt)
{
int i;
- if (from >= nr_irqs || (from + cnt) > nr_irqs)
+ if (from >= total_nr_irqs || (from + cnt) > total_nr_irqs)
return;
guard(mutex)(&sparse_irq_lock);
@@ -911,7 +915,7 @@ int __ref __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int no
if (irq >=0 && start != irq)
return -EEXIST;
- if (start + cnt > nr_irqs) {
+ if (start + cnt > total_nr_irqs) {
if (!irq_expand_nr_irqs(start + cnt))
return -ENOMEM;
}
@@ -923,11 +927,15 @@ EXPORT_SYMBOL_GPL(__irq_alloc_descs);
* irq_get_next_irq - get next allocated irq number
* @offset: where to start the search
*
- * Returns next irq number after offset or nr_irqs if none is found.
+ * Returns next irq number at or after offset or total_nr_irqs if none is found.
*/
unsigned int irq_get_next_irq(unsigned int offset)
{
- return irq_find_at_or_after(offset);
+ struct irq_desc *desc;
+
+ guard(rcu)();
+ desc = irq_find_desc_at_or_after(offset);
+ return desc ? irq_desc_get_irq(desc) : total_nr_irqs;
}
struct irq_desc *__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index cc93abf009e8..f15c9f1223bb 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -20,6 +20,8 @@
#include <linux/smp.h>
#include <linux/fs.h>
+#include "proc.h"
+
static LIST_HEAD(irq_domain_list);
static DEFINE_MUTEX(irq_domain_mutex);
@@ -1532,6 +1534,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
irq_data->chip = (struct irq_chip *)(chip ? chip : &no_irq_chip);
irq_data->chip_data = chip_data;
+ irq_proc_update_chip(chip);
return 0;
}
EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip);
@@ -2081,7 +2084,7 @@ static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq
#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
-#include "internals.h"
+#include "debugfs.h"
static struct dentry *domain_dir;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2e8072437826..7eb07e3bdb4c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1802,6 +1802,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
__enable_irq(desc);
}
+ irq_proc_update_valid(desc);
raw_spin_unlock_irqrestore(&desc->lock, flags);
chip_bus_sync_unlock(desc);
mutex_unlock(&desc->request_mutex);
@@ -1906,6 +1907,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
desc->affinity_hint = NULL;
#endif
+ irq_proc_update_valid(desc);
raw_spin_unlock_irqrestore(&desc->lock, flags);
/*
* Drop bus_lock here so the changes which were done in the chip
@@ -2026,24 +2028,32 @@ const void *free_irq(unsigned int irq, void *dev_id)
}
EXPORT_SYMBOL(free_irq);
-/* This function must be called with desc->lock held */
static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc)
{
+ struct irqaction *action = NULL;
const char *devname = NULL;
- desc->istate &= ~IRQS_NMI;
+ scoped_guard(raw_spinlock_irqsave, &desc->lock) {
+ irq_nmi_teardown(desc);
- if (!WARN_ON(desc->action == NULL)) {
- irq_pm_remove_action(desc, desc->action);
- devname = desc->action->name;
- unregister_handler_proc(irq, desc->action);
+ desc->istate &= ~IRQS_NMI;
- kfree(desc->action);
+ if (!WARN_ON(desc->action == NULL)) {
+ action = desc->action;
+ irq_pm_remove_action(desc, action);
+ devname = action->name;
+ }
desc->action = NULL;
+
+ irq_settings_clr_disable_unlazy(desc);
+ irq_shutdown_and_deactivate(desc);
}
- irq_settings_clr_disable_unlazy(desc);
- irq_shutdown_and_deactivate(desc);
+ irq_proc_update_valid(desc);
+
+ if (action)
+ unregister_handler_proc(irq, action);
+ kfree(action);
irq_release_resources(desc);
@@ -2067,8 +2077,6 @@ const void *free_nmi(unsigned int irq, void *dev_id)
if (WARN_ON(desc->depth == 0))
disable_nmi_nosync(irq);
- guard(raw_spinlock_irqsave)(&desc->lock);
- irq_nmi_teardown(desc);
return __cleanup_nmi(irq, desc);
}
@@ -2318,13 +2326,14 @@ int request_nmi(unsigned int irq, irq_handler_t handler,
/* Setup NMI state */
desc->istate |= IRQS_NMI;
retval = irq_nmi_setup(desc);
- if (retval) {
- __cleanup_nmi(irq, desc);
- return -EINVAL;
- }
- return 0;
}
+ if (retval) {
+ __cleanup_nmi(irq, desc);
+ return -EINVAL;
+ }
+ return 0;
+
err_irq_setup:
irq_chip_pm_put(&desc->irq_data);
err_out:
@@ -2428,8 +2437,10 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_
*action_ptr = action->next;
/* Demote from NMI if we killed the last action */
- if (!desc->action)
+ if (!desc->action) {
desc->istate &= ~IRQS_NMI;
+ irq_proc_update_valid(desc);
+ }
}
unregister_handler_proc(irq, action);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index b0999a4f1f68..1b835725f7b1 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -10,6 +10,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/interrupt.h>
+#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/mutex.h>
#include <linux/string.h>
@@ -326,7 +327,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
#undef MAX_NAMELEN
-#define MAX_NAMELEN 10
+#define MAX_NAMELEN 11
void register_irq_proc(unsigned int irq, struct irq_desc *desc)
{
@@ -348,7 +349,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
return;
/* create /proc/irq/1234 */
- sprintf(name, "%u", irq);
+ snprintf(name, MAX_NAMELEN, "%u", irq);
desc->dir = proc_mkdir(name, root_irq_dir);
if (!desc->dir)
return;
@@ -401,7 +402,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
#endif
remove_proc_entry("spurious", desc->dir);
- sprintf(name, "%u", irq);
+ snprintf(name, MAX_NAMELEN, "%u", irq);
remove_proc_entry(name, root_irq_dir);
}
@@ -439,77 +440,159 @@ void init_irq_proc(void)
register_irq_proc(irq, desc);
}
+void irq_proc_update_valid(struct irq_desc *desc)
+{
+ u32 set = _IRQ_PROC_VALID;
+
+ if (irq_settings_is_hidden(desc) || irq_desc_is_chained(desc) || !desc->action)
+ set = 0;
+
+ irq_settings_update_proc_valid(desc, set);
+}
+
#ifdef CONFIG_GENERIC_IRQ_SHOW
+#define ARCH_PROC_IRQDESC ((void *)0x00001111)
+
int __weak arch_show_interrupts(struct seq_file *p, int prec)
{
return 0;
}
+static DEFINE_RAW_SPINLOCK(irq_proc_constraints_lock);
+
+static struct irq_proc_constraints {
+ bool print_header;
+ unsigned int num_prec;
+ unsigned int chip_width;
+} irq_proc_constraints __read_mostly = {
+ .num_prec = 4,
+ .chip_width = 8,
+};
+
#ifndef ACTUAL_NR_IRQS
-# define ACTUAL_NR_IRQS irq_get_nr_irqs()
+# define ACTUAL_NR_IRQS total_nr_irqs
#endif
-int show_interrupts(struct seq_file *p, void *v)
+void irq_proc_calc_prec(void)
{
- const unsigned int nr_irqs = irq_get_nr_irqs();
- static int prec;
+ unsigned int prec, n;
- int i = *(loff_t *) v, j;
- struct irqaction *action;
- struct irq_desc *desc;
+ for (prec = 4, n = 10000; prec < 10 && n <= total_nr_irqs; ++prec)
+ n *= 10;
+
+ guard(raw_spinlock_irqsave)(&irq_proc_constraints_lock);
+ if (prec > irq_proc_constraints.num_prec)
+ WRITE_ONCE(irq_proc_constraints.num_prec, prec);
+}
+
+void irq_proc_update_chip(const struct irq_chip *chip)
+{
+ unsigned int len = chip && chip->name ? strlen(chip->name) : 0;
+
+ if (!len || len <= READ_ONCE(irq_proc_constraints.chip_width))
+ return;
+
+ /* Can be invoked from interrupt disabled contexts */
+ guard(raw_spinlock_irqsave)(&irq_proc_constraints_lock);
+ if (len > irq_proc_constraints.chip_width)
+ WRITE_ONCE(irq_proc_constraints.chip_width, len);
+}
+
+/* Same as seq_put_decimal_ull_width(p, " ", cnt, 10) */
+#define ZSTR1 " 0"
+#define ZSTR1_LEN (sizeof(ZSTR1) - 1)
+#define ZSTR16 ZSTR1 ZSTR1 ZSTR1 ZSTR1 ZSTR1 ZSTR1 ZSTR1 ZSTR1 \
+ ZSTR1 ZSTR1 ZSTR1 ZSTR1 ZSTR1 ZSTR1 ZSTR1 ZSTR1
+#define ZSTR256 ZSTR16 ZSTR16 ZSTR16 ZSTR16 ZSTR16 ZSTR16 ZSTR16 ZSTR16 \
+ ZSTR16 ZSTR16 ZSTR16 ZSTR16 ZSTR16 ZSTR16 ZSTR16 ZSTR16
+
+static inline void irq_proc_emit_zero_counts(struct seq_file *p, unsigned int zeros)
+{
+ if (!zeros)
+ return;
+
+ for (unsigned int n = min(zeros, 256); n; zeros -= n, n = min(zeros, 256))
+ seq_write(p, ZSTR256, n * ZSTR1_LEN);
+}
+
+static inline unsigned int irq_proc_emit_count(struct seq_file *p, unsigned int cnt,
+ unsigned int zeros)
+{
+ if (!cnt)
+ return zeros + 1;
- if (i > ACTUAL_NR_IRQS)
- return 0;
+ irq_proc_emit_zero_counts(p, zeros);
+ seq_put_decimal_ull_width(p, " ", cnt, 10);
+ return 0;
+}
- if (i == ACTUAL_NR_IRQS)
- return arch_show_interrupts(p, prec);
+void irq_proc_emit_counts(struct seq_file *p, unsigned int __percpu *cnts)
+{
+ unsigned int cpu, zeros = 0;
- /* print header and calculate the width of the first column */
- if (i == 0) {
- for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
- j *= 10;
+ for_each_online_cpu(cpu)
+ zeros = irq_proc_emit_count(p, per_cpu(*cnts, cpu), zeros);
+ irq_proc_emit_zero_counts(p, zeros);
+}
- seq_printf(p, "%*s", prec + 8, "");
- for_each_online_cpu(j)
- seq_printf(p, "CPU%-8d", j);
+static int irq_seq_show(struct seq_file *p, void *v)
+{
+ struct irq_proc_constraints *constr = p->private;
+ struct irq_desc *desc = v;
+ struct irqaction *action;
+
+ /* Print header for the first interrupt? */
+ if (constr->print_header) {
+ unsigned int cpu;
+
+ seq_printf(p, "%*s", constr->num_prec + 8, "");
+ for_each_online_cpu(cpu)
+ seq_printf(p, "CPU%-8d", cpu);
seq_putc(p, '\n');
+ constr->print_header = false;
}
- guard(rcu)();
- desc = irq_to_desc(i);
- if (!desc || irq_settings_is_hidden(desc))
- return 0;
+ if (desc == ARCH_PROC_IRQDESC)
+ return arch_show_interrupts(p, constr->num_prec);
- if (!desc->action || irq_desc_is_chained(desc) || !desc->kstat_irqs)
- return 0;
+ seq_put_decimal_ull_width(p, "", irq_desc_get_irq(desc), constr->num_prec);
+ seq_putc(p, ':');
- seq_printf(p, "%*d:", prec, i);
- for_each_online_cpu(j) {
- unsigned int cnt = desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, j) : 0;
+ /*
+ * Always output per CPU interrupts. Output device interrupts only when
+ * desc::tot_count is not zero.
+ */
+ if (irq_settings_is_per_cpu(desc) || irq_settings_is_per_cpu_devid(desc) ||
+ data_race(desc->tot_count))
+ irq_proc_emit_counts(p, &desc->kstat_irqs->cnt);
+ else
+ irq_proc_emit_zero_counts(p, num_online_cpus());
- seq_put_decimal_ull_width(p, " ", cnt, 10);
- }
- seq_putc(p, ' ');
+ /* Enforce a visual gap */
+ seq_write(p, " ", 2);
guard(raw_spinlock_irq)(&desc->lock);
if (desc->irq_data.chip) {
if (desc->irq_data.chip->irq_print_chip)
desc->irq_data.chip->irq_print_chip(&desc->irq_data, p);
else if (desc->irq_data.chip->name)
- seq_printf(p, "%8s", desc->irq_data.chip->name);
+ seq_printf(p, "%-*s", constr->chip_width, desc->irq_data.chip->name);
else
- seq_printf(p, "%8s", "-");
+ seq_printf(p, "%-*s", constr->chip_width, "-");
} else {
- seq_printf(p, "%8s", "None");
+ seq_printf(p, "%-*s", constr->chip_width, "None");
}
+
+ seq_putc(p, ' ');
if (desc->irq_data.domain)
- seq_printf(p, " %*lu", prec, desc->irq_data.hwirq);
+ seq_put_decimal_ull_width(p, "", desc->irq_data.hwirq, constr->num_prec);
else
- seq_printf(p, " %*s", prec, "");
-#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
- seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
-#endif
+ seq_printf(p, " %*s", constr->num_prec, "");
+
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_SHOW_LEVEL))
+ seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
+
if (desc->name)
seq_printf(p, "-%-8s", desc->name);
@@ -523,4 +606,73 @@ int show_interrupts(struct seq_file *p, void *v)
seq_putc(p, '\n');
return 0;
}
+
+static void *irq_seq_next_desc(loff_t *pos)
+{
+ if (*pos > total_nr_irqs)
+ return NULL;
+
+ guard(rcu)();
+ for (;;) {
+ struct irq_desc *desc = irq_find_desc_at_or_after((unsigned int) *pos);
+
+ if (desc) {
+ *pos = irq_desc_get_irq(desc);
+ /*
+ * If valid for output then try to acquire a reference
+ * count on the descriptor so that it can't be freed
+ * after dropping RCU read lock on return.
+ */
+ if (irq_settings_proc_valid(desc) && irq_desc_get_ref(desc))
+ return desc;
+ (*pos)++;
+ } else {
+ *pos = total_nr_irqs;
+ return ARCH_PROC_IRQDESC;
+ }
+ }
+}
+
+static void *irq_seq_start(struct seq_file *f, loff_t *pos)
+{
+ if (!*pos) {
+ struct irq_proc_constraints *constr = f->private;
+
+ constr->num_prec = READ_ONCE(irq_proc_constraints.num_prec);
+ constr->chip_width = READ_ONCE(irq_proc_constraints.chip_width);
+ constr->print_header = true;
+ }
+ return irq_seq_next_desc(pos);
+}
+
+static void *irq_seq_next(struct seq_file *f, void *v, loff_t *pos)
+{
+ if (v && v != ARCH_PROC_IRQDESC)
+ irq_desc_put_ref(v);
+
+ (*pos)++;
+ return irq_seq_next_desc(pos);
+}
+
+static void irq_seq_stop(struct seq_file *f, void *v)
+{
+ if (v && v != ARCH_PROC_IRQDESC)
+ irq_desc_put_ref(v);
+}
+
+static const struct seq_operations irq_seq_ops = {
+ .start = irq_seq_start,
+ .next = irq_seq_next,
+ .stop = irq_seq_stop,
+ .show = irq_seq_show,
+};
+
+static int __init irq_proc_init(void)
+{
+ proc_create_seq_private("interrupts", 0, NULL, &irq_seq_ops,
+ sizeof(irq_proc_constraints), NULL);
+ return 0;
+}
+fs_initcall(irq_proc_init);
+
#endif
diff --git a/kernel/irq/proc.h b/kernel/irq/proc.h
new file mode 100644
index 000000000000..0631d57fbfb7
--- /dev/null
+++ b/kernel/irq/proc.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _KERNEL_IRQ_PROC_H
+#define _KERNEL_IRQ_PROC_H
+
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_GENERIC_IRQ_SHOW)
+void irq_proc_calc_prec(void);
+void irq_proc_update_chip(const struct irq_chip *chip);
+#else
+static inline void irq_proc_calc_prec(void) { }
+static inline void irq_proc_update_chip(const struct irq_chip *chip) { }
+#endif
+
+#endif
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 00b3bd127692..0a0c027a5d34 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -18,6 +18,7 @@ enum {
_IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY,
_IRQ_HIDDEN = IRQ_HIDDEN,
_IRQ_NO_DEBUG = IRQ_NO_DEBUG,
+ _IRQ_PROC_VALID = IRQ_RESERVED,
_IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
};
@@ -34,6 +35,7 @@ enum {
#define IRQ_DISABLE_UNLAZY GOT_YOU_MORON
#define IRQ_HIDDEN GOT_YOU_MORON
#define IRQ_NO_DEBUG GOT_YOU_MORON
+#define IRQ_RESERVED GOT_YOU_MORON
#undef IRQF_MODIFY_MASK
#define IRQF_MODIFY_MASK GOT_YOU_MORON
@@ -180,3 +182,14 @@ static inline bool irq_settings_no_debug(struct irq_desc *desc)
{
return desc->status_use_accessors & _IRQ_NO_DEBUG;
}
+
+static inline bool irq_settings_proc_valid(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_PROC_VALID;
+}
+
+static inline void irq_settings_update_proc_valid(struct irq_desc *desc, u32 set)
+{
+ desc->status_use_accessors &= ~_IRQ_PROC_VALID;
+ desc->status_use_accessors |= (set & _IRQ_PROC_VALID);
+}
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 120fd7365fbe..f7e2dc2c30c6 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -292,6 +292,12 @@ void irq_work_sync(struct irq_work *work)
!arch_irq_work_has_interrupt()) {
rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work),
TASK_UNINTERRUPTIBLE);
+ /*
+ * Ensure irq_work_single() does not access @work
+ * after removing IRQ_WORK_BUSY. It is always
+ * accessed within a RCU-read section.
+ */
+ synchronize_rcu();
return;
}
@@ -302,6 +308,7 @@ EXPORT_SYMBOL_GPL(irq_work_sync);
static void run_irq_workd(unsigned int cpu)
{
+ guard(rcu)();
irq_work_run_list(this_cpu_ptr(&lazy_list));
}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 791210daf8b4..63beb59b7a3d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1619,7 +1619,6 @@ void kthread_use_mm(struct mm_struct *mm)
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(tsk->mm);
- WARN_ON_ONCE(!mm->user_ns);
/*
* It is possible for mm to be the same as tsk->active_mm, but
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index 18509d8082ea..1b592d86dc48 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -357,20 +357,6 @@ int kho_radix_walk_tree(struct kho_radix_tree *tree,
}
EXPORT_SYMBOL_GPL(kho_radix_walk_tree);
-static void __kho_unpreserve(struct kho_radix_tree *tree,
- unsigned long pfn, unsigned long end_pfn)
-{
- unsigned int order;
-
- while (pfn < end_pfn) {
- order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
-
- kho_radix_del_page(tree, pfn, order);
-
- pfn += 1 << order;
- }
-}
-
/* For physically contiguous 0-order pages. */
static void kho_init_pages(struct page *page, unsigned long nr_pages)
{
@@ -860,6 +846,37 @@ void kho_unpreserve_folio(struct folio *folio)
}
EXPORT_SYMBOL_GPL(kho_unpreserve_folio);
+static unsigned int __kho_preserve_pages_order(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ unsigned int order = min(count_trailing_zeros(start_pfn),
+ ilog2(end_pfn - start_pfn));
+
+ /*
+ * Make sure all the pages in a single preservation are in the same NUMA
+ * node. The restore machinery can not cope with a preservation spanning
+ * multiple NUMA nodes.
+ */
+ while (pfn_to_nid(start_pfn) != pfn_to_nid(start_pfn + (1UL << order) - 1))
+ order--;
+
+ return order;
+}
+
+static void __kho_unpreserve(struct kho_radix_tree *tree,
+ unsigned long pfn, unsigned long end_pfn)
+{
+ unsigned int order;
+
+ while (pfn < end_pfn) {
+ order = __kho_preserve_pages_order(pfn, end_pfn);
+
+ kho_radix_del_page(tree, pfn, order);
+
+ pfn += 1 << order;
+ }
+}
+
/**
* kho_preserve_pages - preserve contiguous pages across kexec
* @page: first page in the list.
@@ -885,16 +902,7 @@ int kho_preserve_pages(struct page *page, unsigned long nr_pages)
}
while (pfn < end_pfn) {
- unsigned int order =
- min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
-
- /*
- * Make sure all the pages in a single preservation are in the
- * same NUMA node. The restore machinery can not cope with a
- * preservation spanning multiple NUMA nodes.
- */
- while (pfn_to_nid(pfn) != pfn_to_nid(pfn + (1UL << order) - 1))
- order--;
+ unsigned int order = __kho_preserve_pages_order(pfn, end_pfn);
err = kho_radix_add_page(tree, pfn, order);
if (err) {
@@ -1707,7 +1715,7 @@ int kho_fill_kimage(struct kimage *image)
int err = 0;
struct kexec_buf scratch;
- if (!kho_enable)
+ if (!kho_enable || image->type == KEXEC_TYPE_CRASH)
return 0;
image->kho.fdt = virt_to_phys(kho_out.fdt);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 09534628dc01..8a85912d7ee6 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -763,6 +763,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
raw_spin_lock_irqsave(&lock->wait_lock, flags);
raw_spin_lock(&current->blocked_lock);
__set_task_blocked_on(current, lock);
+ set_current_state(state);
if (opt_acquired)
break;
@@ -980,9 +981,8 @@ EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible);
static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip)
__releases(lock)
{
- struct task_struct *next = NULL;
+ struct task_struct *donor, *next = NULL;
struct mutex_waiter *waiter;
- DEFINE_WAKE_Q(wake_q);
unsigned long owner;
unsigned long flags;
@@ -990,6 +990,14 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
__release(lock);
/*
+ * Ensures the proxy donor stack is stable across unlock and handoff.
+ * Specifically, it avoids the case where current->blocked_donor is
+ * NULL when it is inspected while doing the unlock, but a preemption
+ * before taking the wake_lock would make it set and a hand-off is
+ * missed.
+ */
+ guard(preempt)();
+ /*
* Release the lock before (potentially) taking the spinlock such that
* other contenders can get on with things ASAP.
*
@@ -1001,6 +1009,12 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
MUTEX_WARN_ON(__owner_task(owner) != current);
MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP);
+ if (sched_proxy_exec() && current->blocked_donor) {
+ /* force handoff if we have a blocked_donor */
+ owner = MUTEX_FLAG_HANDOFF;
+ break;
+ }
+
if (owner & MUTEX_FLAG_HANDOFF)
break;
@@ -1013,20 +1027,56 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
}
raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ raw_spin_lock(&current->blocked_lock);
debug_mutex_unlock(lock);
+
+ if (sched_proxy_exec()) {
+ /*
+ * If we have a task boosting current, and that task was boosting
+ * current through this lock, hand the lock to that task, as that
+ * is the highest waiter, as selected by the scheduling function.
+ */
+ donor = current->blocked_donor;
+ if (donor) {
+ struct mutex *next_lock;
+
+ raw_spin_lock_nested(&donor->blocked_lock, SINGLE_DEPTH_NESTING);
+ next_lock = __get_task_blocked_on(donor);
+ if (next_lock == lock) {
+ next = get_task_struct(donor);
+ __clear_task_blocked_on(next, lock);
+ current->blocked_donor = NULL;
+ }
+ raw_spin_unlock(&donor->blocked_lock);
+ }
+ }
+
+ /*
+ * Failing that, pick first on the wait list.
+ */
waiter = lock->first_waiter;
- if (waiter) {
- next = waiter->task;
+ if (!next && waiter) {
+ next = get_task_struct(waiter->task);
+ raw_spin_lock_nested(&next->blocked_lock, SINGLE_DEPTH_NESTING);
debug_mutex_wake_waiter(lock, waiter);
- set_task_blocked_on_waking(next, lock);
- wake_q_add(&wake_q, next);
+ __clear_task_blocked_on(next, lock);
+ raw_spin_unlock(&next->blocked_lock);
+
}
+ if (trace_contended_release_enabled() && waiter)
+ trace_call__contended_release(lock);
+
if (owner & MUTEX_FLAG_HANDOFF)
__mutex_handoff(lock, next);
- raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
+ raw_spin_unlock(&current->blocked_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ if (next) {
+ wake_up_process(next);
+ put_task_struct(next);
+ }
}
#ifndef CONFIG_DEBUG_LOCK_ALLOC
@@ -1220,6 +1270,7 @@ EXPORT_SYMBOL(ww_mutex_lock_interruptible);
EXPORT_TRACEPOINT_SYMBOL_GPL(contention_begin);
EXPORT_TRACEPOINT_SYMBOL_GPL(contention_end);
+EXPORT_TRACEPOINT_SYMBOL_GPL(contended_release);
/**
* atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index ef234469baac..f7e152c40d6d 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -263,6 +263,9 @@ void percpu_up_write(struct percpu_rw_semaphore *sem)
{
rwsem_release(&sem->dep_map, _RET_IP_);
+ if (trace_contended_release_enabled() && wq_has_sleeper(&sem->waiters))
+ trace_call__contended_release(sem);
+
/*
* Signal the writer is done, no fast path yet.
*
@@ -288,3 +291,29 @@ void percpu_up_write(struct percpu_rw_semaphore *sem)
rcu_sync_exit(&sem->rss);
}
EXPORT_SYMBOL_GPL(percpu_up_write);
+
+void __percpu_up_read(struct percpu_rw_semaphore *sem)
+{
+ lockdep_assert_preemption_disabled();
+ /*
+ * After percpu_up_write() completes, rcu_sync_is_idle() can still
+ * return false during the grace period, forcing readers into this
+ * slowpath. Only trace when a writer is actually waiting for
+ * readers to drain.
+ */
+ if (trace_contended_release_enabled() && rcuwait_active(&sem->writer))
+ trace_call__contended_release(sem);
+ /*
+ * slowpath; reader will only ever wake a single blocked
+ * writer.
+ */
+ smp_mb(); /* B matches C */
+ /*
+ * In other words, if they see our decrement (presumably to
+ * aggregate zero, as that is the only time it matters) they
+ * will also see our critical section.
+ */
+ this_cpu_dec(*sem->read_count);
+ rcuwait_wake_up(&sem->writer);
+}
+EXPORT_SYMBOL_GPL(__percpu_up_read);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 4f386ea6c792..4728631ae719 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -484,6 +484,7 @@ static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_nod
static __always_inline void
rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
+ __must_hold(&lock->wait_lock)
{
lockdep_assert_held(&lock->wait_lock);
@@ -492,6 +493,7 @@ rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
static __always_inline void
rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
+ __must_hold(&lock->wait_lock)
{
lockdep_assert_held(&lock->wait_lock);
@@ -1092,6 +1094,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
static int __sched
try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task,
struct rt_mutex_waiter *waiter)
+ __must_hold(&lock->wait_lock)
{
lockdep_assert_held(&lock->wait_lock);
@@ -1319,6 +1322,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
*/
static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh,
struct rt_mutex_base *lock)
+ __must_hold(&lock->wait_lock)
{
struct rt_mutex_waiter *waiter;
@@ -1466,6 +1470,7 @@ static void __sched rt_mutex_slowunlock(struct rt_mutex_base *lock)
raw_spin_lock_irqsave(&lock->wait_lock, flags);
}
+ trace_contended_release(lock);
/*
* The wakeup next waiter path does not suffer from the above
* race. See the comments there.
@@ -1558,6 +1563,9 @@ static void __sched remove_waiter(struct rt_mutex_base *lock,
lockdep_assert_held(&lock->wait_lock);
+ if (!waiter_task) /* never enqueued */
+ return;
+
scoped_guard(raw_spinlock, &waiter_task->pi_lock) {
rt_mutex_dequeue(lock, waiter);
waiter_task->pi_blocked_on = NULL;
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index 124219aea46e..5d48d64725b1 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -41,6 +41,7 @@ static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock,
unsigned int state,
struct lockdep_map *nest_lock,
unsigned int subclass)
+ __cond_acquires(0, lock)
{
int ret;
@@ -67,13 +68,27 @@ EXPORT_SYMBOL(rt_mutex_base_init);
*/
void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
{
- __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, subclass);
+ if (__rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, subclass) == 0)
+ return;
+ /*
+ * The code below is never reached because __rt_mutex_lock_common() only
+ * returns an error code if interrupted by a signal or upon a timeout.
+ */
+ WARN_ON_ONCE(true);
+ __acquire(lock);
}
EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
void __sched _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock)
{
- __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, nest_lock, 0);
+ if (__rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, nest_lock, 0) == 0)
+ return;
+ /*
+ * The code below is never reached because __rt_mutex_lock_common() only
+ * returns an error code if interrupted by a signal or upon a timeout.
+ */
+ WARN_ON_ONCE(true);
+ __acquire(lock);
}
EXPORT_SYMBOL_GPL(_rt_mutex_lock_nest_lock);
@@ -86,7 +101,14 @@ EXPORT_SYMBOL_GPL(_rt_mutex_lock_nest_lock);
*/
void __sched rt_mutex_lock(struct rt_mutex *lock)
{
- __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, 0);
+ if (__rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, 0) == 0)
+ return;
+ /*
+ * The code below is never reached because __rt_mutex_lock_common() only
+ * returns an error code if interrupted by a signal or upon a timeout.
+ */
+ WARN_ON_ONCE(true);
+ __acquire(lock);
}
EXPORT_SYMBOL_GPL(rt_mutex_lock);
#endif
@@ -157,6 +179,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
{
mutex_release(&lock->dep_map, _RET_IP_);
__rt_mutex_unlock(&lock->rtmutex);
+ __release(lock);
}
EXPORT_SYMBOL_GPL(rt_mutex_unlock);
@@ -182,6 +205,7 @@ int __sched __rt_mutex_futex_trylock(struct rt_mutex_base *lock)
*/
bool __sched __rt_mutex_futex_unlock(struct rt_mutex_base *lock,
struct rt_wake_q_head *wqh)
+ __must_hold(&lock->wait_lock)
{
lockdep_assert_held(&lock->wait_lock);
@@ -312,6 +336,7 @@ int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task,
struct wake_q_head *wake_q)
+ __must_hold(&lock->wait_lock)
{
int ret;
@@ -365,7 +390,7 @@ int __sched rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
raw_spin_lock_irq(&lock->wait_lock);
ret = __rt_mutex_start_proxy_lock(lock, waiter, task, &wake_q);
- if (unlikely(ret))
+ if (unlikely(ret < 0))
remove_waiter(lock, waiter);
preempt_disable();
raw_spin_unlock_irq(&lock->wait_lock);
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index 82e078c0665a..2835c9ef9b3f 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -174,6 +174,8 @@ static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb,
static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb,
unsigned int state)
{
+ if (trace_contended_release_enabled() && rt_mutex_owner(&rwb->rtmutex))
+ trace_call__contended_release(rwb);
/*
* rwb->readers can only hit 0 when a writer is waiting for the
* active readers to leave the critical section.
@@ -205,6 +207,8 @@ static inline void rwbase_write_unlock(struct rwbase_rt *rwb)
unsigned long flags;
raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ if (trace_contended_release_enabled() && rt_mutex_has_waiters(rtm))
+ trace_call__contended_release(rwb);
__rwbase_write_unlock(rwb, WRITER_BIAS, flags);
}
@@ -214,6 +218,8 @@ static inline void rwbase_write_downgrade(struct rwbase_rt *rwb)
unsigned long flags;
raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ if (trace_contended_release_enabled() && rt_mutex_has_waiters(rtm))
+ trace_call__contended_release(rwb);
/* Release it and account current as reader */
__rwbase_write_unlock(rwb, WRITER_BIAS - 1, flags);
}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index bf647097369c..b9c180ac1eee 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1387,6 +1387,8 @@ static inline void __up_read(struct rw_semaphore *sem)
rwsem_clear_reader_owned(sem);
tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
+ if (trace_contended_release_enabled() && (tmp & RWSEM_FLAG_WAITERS))
+ trace_call__contended_release(sem);
if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
RWSEM_FLAG_WAITERS)) {
clear_nonspinnable(sem);
@@ -1413,8 +1415,10 @@ static inline void __up_write(struct rw_semaphore *sem)
preempt_disable();
rwsem_clear_owner(sem);
tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
- if (unlikely(tmp & RWSEM_FLAG_WAITERS))
+ if (unlikely(tmp & RWSEM_FLAG_WAITERS)) {
+ trace_contended_release(sem);
rwsem_wake(sem);
+ }
preempt_enable();
}
@@ -1437,8 +1441,10 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
tmp = atomic_long_fetch_add_release(
-RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
rwsem_set_reader_owned(sem);
- if (tmp & RWSEM_FLAG_WAITERS)
+ if (tmp & RWSEM_FLAG_WAITERS) {
+ trace_contended_release(sem);
rwsem_downgrade_wake(sem);
+ }
preempt_enable();
}
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 74d41433ba13..233730c25933 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -230,6 +230,10 @@ void __sched up(struct semaphore *sem)
sem->count++;
else
__up(sem, &wake_q);
+
+ if (trace_contended_release_enabled() && !wake_q_empty(&wake_q))
+ trace_call__contended_release(sem);
+
raw_spin_unlock_irqrestore(&sem->lock, flags);
if (!wake_q_empty(&wake_q))
wake_up_q(&wake_q);
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
index 6c12452097e1..d62b49b53ec3 100644
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -324,7 +324,7 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
* blocked_on to PROXY_WAKING. Otherwise we can see
* circular blocked_on relationships that can't resolve.
*/
- set_task_blocked_on_waking(waiter->task, lock);
+ clear_task_blocked_on(waiter->task, lock);
wake_q_add(wake_q, waiter->task);
}
@@ -383,7 +383,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
* are waking the mutex owner, who may be currently
* blocked on a different mutex.
*/
- set_task_blocked_on_waking(owner, NULL);
+ clear_task_blocked_on(owner, NULL);
wake_q_add(wake_q, owner);
}
return true;
diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c
index 36f52a232a12..cce098671be9 100644
--- a/kernel/module/decompress.c
+++ b/kernel/module/decompress.c
@@ -307,6 +307,8 @@ int module_decompress(struct load_info *info, const void *buf, size_t size)
*/
n_pages = DIV_ROUND_UP(size, PAGE_SIZE) * 2;
error = module_extend_max_pages(info, n_pages);
+ if (error)
+ return error;
data_size = MODULE_DECOMPRESS_FN(info, buf, size);
if (data_size < 0) {
diff --git a/kernel/panic.c b/kernel/panic.c
index 20feada5319d..213725b612aa 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -39,6 +39,7 @@
#include <linux/sys_info.h>
#include <trace/events/error_report.h>
#include <asm/sections.h>
+#include <kunit/test-bug.h>
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
@@ -1124,6 +1125,11 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint,
bool rcu = warn_rcu_enter();
struct warn_args args;
+ if (kunit_is_suppressed_warning(true)) {
+ warn_rcu_exit(rcu);
+ return;
+ }
+
pr_warn(CUT_HERE);
if (!fmt) {
@@ -1146,6 +1152,11 @@ void __warn_printk(const char *fmt, ...)
bool rcu = warn_rcu_enter();
va_list args;
+ if (kunit_is_suppressed_warning(false)) {
+ warn_rcu_exit(rcu);
+ return;
+ }
+
pr_warn(CUT_HERE);
va_start(args, fmt);
diff --git a/kernel/params.c b/kernel/params.c
index 74d620bc2521..a668863a4bb6 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -942,9 +942,9 @@ const struct kobj_type module_ktype = {
/*
* param_sysfs_init - create "module" kset
*
- * This must be done before the initramfs is unpacked and
- * request_module() thus becomes possible, because otherwise the
- * module load would fail in mod_sysfs_init.
+ * This must be done before any driver registration so that when a driver comes
+ * from a built-in module, the driver core can add the module under /sys/module
+ * and create the associated driver symlinks.
*/
static int __init param_sysfs_init(void)
{
@@ -957,7 +957,7 @@ static int __init param_sysfs_init(void)
return 0;
}
-subsys_initcall(param_sysfs_init);
+pure_initcall(param_sysfs_init);
/*
* param_sysfs_builtin_init - add sysfs version and parameter
diff --git a/kernel/pid.c b/kernel/pid.c
index fd5c2d4aa349..f55189a3d07d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -885,10 +885,12 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd)
if (ret)
return ERR_PTR(ret);
- if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
- file = fget_task(task, fd);
- else
+ if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
file = ERR_PTR(-EPERM);
+ else if (task->flags & PF_EXITING)
+ file = ERR_PTR(-ESRCH);
+ else
+ file = fget_task(task, fd);
up_read(&task->signal->exec_update_lock);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 05337f437cca..530c897311d4 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -42,6 +42,7 @@ config HIBERNATION
select CRC32
select CRYPTO
select CRYPTO_LZO
+ select CRYPTO_LZ4
help
Enable the suspend to disk (STD) functionality, which is usually
called "hibernation" in user interfaces. STD checkpoints the
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index af8d07bafe02..d2479c69d71a 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -392,23 +392,6 @@ static int create_image(int platform_mode)
return error;
}
-static void shrink_shmem_memory(void)
-{
- struct sysinfo info;
- unsigned long nr_shmem_pages, nr_freed_pages;
-
- si_meminfo(&info);
- nr_shmem_pages = info.sharedram; /* current page count used for shmem */
- /*
- * The intent is to reclaim all shmem pages. Though shrink_all_memory() can
- * only reclaim about half of them, it's enough for creating the hibernation
- * image.
- */
- nr_freed_pages = shrink_all_memory(nr_shmem_pages);
- pr_debug("requested to reclaim %lu shmem pages, actually freed %lu pages\n",
- nr_shmem_pages, nr_freed_pages);
-}
-
/**
* hibernation_snapshot - Quiesce devices and create a hibernation image.
* @platform_mode: If set, use platform driver to prepare for the transition.
@@ -425,14 +408,9 @@ int hibernation_snapshot(int platform_mode)
if (error)
goto Close;
- /* Preallocate image memory before shutting down devices. */
- error = hibernate_preallocate_memory();
- if (error)
- goto Close;
-
error = freeze_kernel_threads();
if (error)
- goto Cleanup;
+ goto Close;
if (hibernation_test(TEST_FREEZER)) {
@@ -445,19 +423,13 @@ int hibernation_snapshot(int platform_mode)
}
error = dpm_prepare(PMSG_FREEZE);
- if (error) {
- dpm_complete(PMSG_RECOVER);
- goto Thaw;
- }
+ if (error)
+ goto Complete;
- /*
- * Device drivers may move lots of data to shmem in dpm_prepare(). The shmem
- * pages will use lots of system memory, causing hibernation image creation
- * fail due to insufficient free memory.
- * This call is to force flush the shmem pages to swap disk and reclaim
- * the system memory so that image creation can succeed.
- */
- shrink_shmem_memory();
+ /* Preallocate image memory before shutting down devices. */
+ error = hibernate_preallocate_memory();
+ if (error)
+ goto Complete;
console_suspend_all();
pm_restrict_gfp_mask();
@@ -492,10 +464,10 @@ int hibernation_snapshot(int platform_mode)
platform_end(platform_mode);
return error;
+ Complete:
+ dpm_complete(PMSG_RECOVER);
Thaw:
thaw_kernel_threads();
- Cleanup:
- swsusp_free();
goto Close;
}
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 398b994b73aa..1944dbeb0d4c 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -519,18 +519,23 @@ static int __init cpu_latency_qos_init(void)
int ret;
ret = misc_register(&cpu_latency_qos_miscdev);
- if (ret < 0)
+ if (ret < 0) {
pr_err("%s: %s setup failed\n", __func__,
cpu_latency_qos_miscdev.name);
+ return ret;
+ }
#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP
ret = misc_register(&cpu_wakeup_latency_qos_miscdev);
- if (ret < 0)
+ if (ret < 0) {
pr_err("%s: %s setup failed\n", __func__,
cpu_wakeup_latency_qos_miscdev.name);
+ misc_deregister(&cpu_latency_qos_miscdev);
+ return ret;
+ }
#endif
- return ret;
+ return 0;
}
late_initcall(cpu_latency_qos_init);
#endif /* CONFIG_CPU_IDLE */
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 2e64869bb5a0..b28233b8d00e 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -570,29 +570,23 @@ struct crc_data {
wait_queue_head_t done; /* crc update done */
u32 *crc32; /* points to handle's crc32 */
size_t **unc_len; /* uncompressed lengths */
- unsigned char **unc; /* uncompressed data */
+ unsigned char *unc[]; /* uncompressed data */
};
static struct crc_data *alloc_crc_data(int nr_threads)
{
struct crc_data *crc;
- crc = kzalloc_obj(*crc);
+ crc = kzalloc_flex(*crc, unc, nr_threads);
if (!crc)
return NULL;
- crc->unc = kcalloc(nr_threads, sizeof(*crc->unc), GFP_KERNEL);
- if (!crc->unc)
- goto err_free_crc;
-
crc->unc_len = kzalloc_objs(*crc->unc_len, nr_threads);
if (!crc->unc_len)
- goto err_free_unc;
+ goto err_free_crc;
return crc;
-err_free_unc:
- kfree(crc->unc);
err_free_crc:
kfree(crc);
return NULL;
@@ -607,7 +601,6 @@ static void free_crc_data(struct crc_data *crc)
kthread_stop(crc->thr);
kfree(crc->unc_len);
- kfree(crc->unc);
kfree(crc);
}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 68c17daef8d4..d041645d9d17 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -13,6 +13,7 @@
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
+#include <linux/sched/exec_state.h>
#include <linux/sched/task.h>
#include <linux/errno.h>
#include <linux/mm.h>
@@ -36,6 +37,30 @@
#include <asm/syscall.h> /* for syscall_get_* */
+/**
+ * ptracer_access_allowed - may current peek/poke @tsk's address space?
+ * @tsk: tracee
+ *
+ * Per-access check used by ptrace_access_vm() and architecture-specific
+ * tag/register accessors. Returns true iff current is the registered
+ * ptracer of @tsk and either @tsk is owner-dumpable or current holds
+ * CAP_SYS_PTRACE in @tsk's exec namespace. Lighter than
+ * __ptrace_may_access(): it re-validates only dumpability and
+ * capability on every access, without re-running LSM hooks or
+ * cred_cap_issubset() checks performed at attach time.
+ */
+bool ptracer_access_allowed(struct task_struct *tsk)
+{
+ const struct task_exec_state *es;
+
+ guard(rcu)();
+ if (ptrace_parent(tsk) != current)
+ return false;
+ es = task_exec_state_rcu(tsk);
+ return READ_ONCE(es->dumpable) == TASK_DUMPABLE_OWNER ||
+ ptracer_capable(tsk, es->user_ns);
+}
+
/*
* Access another process' address space via ptrace.
* Source/target buffer must be kernel space,
@@ -45,21 +70,14 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
void *buf, int len, unsigned int gup_flags)
{
struct mm_struct *mm;
- int ret;
+ int ret = 0;
mm = get_task_mm(tsk);
if (!mm)
return 0;
- if (!tsk->ptrace ||
- (current != tsk->parent) ||
- ((get_dumpable(mm) != SUID_DUMP_USER) &&
- !ptracer_capable(tsk, mm->user_ns))) {
- mmput(mm);
- return 0;
- }
-
- ret = access_remote_vm(mm, addr, buf, len, gup_flags);
+ if (ptracer_access_allowed(tsk))
+ ret = access_remote_vm(mm, addr, buf, len, gup_flags);
mmput(mm);
return ret;
@@ -272,11 +290,21 @@ static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
return ns_capable(ns, CAP_SYS_PTRACE);
}
+static bool task_still_dumpable(struct task_struct *task, unsigned int mode)
+{
+ const struct task_exec_state *exec_state;
+
+ guard(rcu)();
+ exec_state = task_exec_state_rcu(task);
+ if (READ_ONCE(exec_state->dumpable) == TASK_DUMPABLE_OWNER)
+ return true;
+ return ptrace_has_cap(exec_state->user_ns, mode);
+}
+
/* Returns 0 on success, -errno on denial. */
static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
{
const struct cred *cred = current_cred(), *tcred;
- struct mm_struct *mm;
kuid_t caller_uid;
kgid_t caller_gid;
@@ -337,11 +365,8 @@ ok:
* Pairs with a write barrier in commit_creds().
*/
smp_rmb();
- mm = task->mm;
- if (mm &&
- ((get_dumpable(mm) != SUID_DUMP_USER) &&
- !ptrace_has_cap(mm->user_ns, mode)))
- return -EPERM;
+ if (!task_still_dumpable(task, mode))
+ return -EPERM;
return security_ptrace_access_check(task, mode);
}
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 5f2848b828dc..882a158ada7b 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -572,7 +572,7 @@ static unsigned long rcu_no_completed(void)
static void rcu_torture_deferred_free(struct rcu_torture *p)
{
- call_rcu_hurry(&p->rtort_rcu, rcu_torture_cb);
+ call_rcu(&p->rtort_rcu, rcu_torture_cb);
}
static void rcu_sync_torture_init(void)
@@ -619,7 +619,7 @@ static struct rcu_torture_ops rcu_ops = {
.poll_gp_state_exp = poll_state_synchronize_rcu,
.cond_sync_exp = cond_synchronize_rcu_expedited,
.cond_sync_exp_full = cond_synchronize_rcu_expedited_full,
- .call = call_rcu_hurry,
+ .call = call_rcu,
.cb_barrier = rcu_barrier,
.fqs = rcu_force_quiescent_state,
.gp_kthread_dbg = show_rcu_gp_kthreads,
@@ -1145,7 +1145,7 @@ static void rcu_tasks_torture_deferred_free(struct rcu_torture *p)
static void synchronize_rcu_mult_test(void)
{
- synchronize_rcu_mult(call_rcu_tasks, call_rcu_hurry);
+ synchronize_rcu_mult(call_rcu_tasks, call_rcu);
}
static struct rcu_torture_ops tasks_ops = {
@@ -1632,6 +1632,17 @@ static void do_rtws_sync(struct torture_random_state *trsp, void (*sync)(void))
}
/*
+ * Do an rcu_barrier() to motivate lazy callbacks during a stutter
+ * pause. Without this, we can get false-positive rtort_pipe_count
+ * splats.
+ */
+static void rcu_torture_writer_work(struct work_struct *work)
+{
+ if (cur_ops->cb_barrier)
+ cur_ops->cb_barrier();
+}
+
+/*
* RCU torture writer kthread. Repeatedly substitutes a new structure
* for that pointed to by rcu_torture_current, freeing the old structure
* after a series of grace periods (the "pipeline").
@@ -1651,6 +1662,7 @@ rcu_torture_writer(void *arg)
int i;
int idx;
unsigned long j;
+ struct work_struct lazy_work;
int oldnice = task_nice(current);
struct rcu_gp_oldstate *rgo = NULL;
int rgo_size = 0;
@@ -1703,6 +1715,9 @@ rcu_torture_writer(void *arg)
pr_alert("%s" TORTURE_FLAG " Waited %lu jiffies for boot to complete.\n",
torture_type, jiffies - j);
+ if (IS_ENABLED(CONFIG_RCU_LAZY))
+ INIT_WORK_ONSTACK(&lazy_work, rcu_torture_writer_work);
+
do {
rcu_torture_writer_state = RTWS_FIXED_DELAY;
torture_hrtimeout_us(500, 1000, &rand);
@@ -1895,6 +1910,8 @@ rcu_torture_writer(void *arg)
!rcu_gp_is_normal();
}
rcu_torture_writer_state = RTWS_STUTTER;
+ if (IS_ENABLED(CONFIG_RCU_LAZY))
+ queue_work(system_percpu_wq, &lazy_work);
stutter_waited = stutter_wait("rcu_torture_writer");
if (stutter_waited &&
!atomic_read(&rcu_fwd_cb_nodelay) &&
@@ -1925,6 +1942,12 @@ rcu_torture_writer(void *arg)
pr_alert("%s" TORTURE_FLAG
" Dynamic grace-period expediting was disabled.\n",
torture_type);
+
+ if (IS_ENABLED(CONFIG_RCU_LAZY)) {
+ cancel_work_sync(&lazy_work);
+ destroy_work_on_stack(&lazy_work);
+ }
+
kfree(ulo);
kfree(rgo);
rcu_torture_writer_state = RTWS_STOPPING;
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 0d01cd8c4b4a..7c2f7cc131f7 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -897,11 +897,9 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp
{
int cpu;
- for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
- if (!(mask & (1UL << (cpu - snp->grplo))))
- continue;
- srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay);
- }
+ for (cpu = snp->grplo; cpu <= snp->grphi; cpu++)
+ if ((mask & (1UL << (cpu - snp->grplo))) && rcu_cpu_beenfullyonline(cpu))
+ srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay);
}
/*
@@ -1322,7 +1320,9 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
*/
idx = __srcu_read_lock_nmisafe(ssp);
ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state);
- if (ss_state < SRCU_SIZE_WAIT_CALL)
+ // If !rcu_cpu_beenfullyonline(), interrupts are still disabled,
+ // so no migration is possible in either direction from this CPU.
+ if (ss_state < SRCU_SIZE_WAIT_CALL || !rcu_cpu_beenfullyonline(raw_smp_processor_id()))
sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
else
sdp = raw_cpu_ptr(ssp->sda);
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 48f0d803c8e2..f4da5fad70f5 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -373,7 +373,8 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
// Queuing callbacks before initialization not yet supported.
if (WARN_ON_ONCE(!rcu_segcblist_is_enabled(&rtpcp->cblist)))
rcu_segcblist_init(&rtpcp->cblist);
- needwake = (func == wakeme_after_rcu) ||
+ needwake = (!havekthread && rcu_segcblist_empty(&rtpcp->cblist)) ||
+ (func == wakeme_after_rcu) ||
(rcu_segcblist_n_cbs(&rtpcp->cblist) == rcu_task_lazy_lim);
if (havekthread && !needwake && !timer_pending(&rtpcp->lazy_timer)) {
if (rtp->lazy_jiffies)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 55df6d37145e..03a43d3d2616 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -492,7 +492,7 @@ static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param
int ret = kstrtoul(val, 0, &j);
if (!ret) {
- WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : (j ?: 1));
+ WRITE_ONCE(*(ulong *)kp->arg, clamp_val(j, 1, HZ));
adjust_jiffies_till_sched_qs();
}
return ret;
@@ -969,14 +969,11 @@ static int rcu_watching_snap_recheck(struct rcu_data *rdp)
if (rcu_cpu_stall_cputime && rdp->snap_record.gp_seq != rdp->gp_seq) {
int cpu = rdp->cpu;
struct rcu_snap_record *rsrp;
- struct kernel_cpustat *kcsp;
-
- kcsp = &kcpustat_cpu(cpu);
rsrp = &rdp->snap_record;
- rsrp->cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu);
- rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu);
- rsrp->cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
+ rsrp->cputime_irq = kcpustat_field(CPUTIME_IRQ, cpu);
+ rsrp->cputime_softirq = kcpustat_field(CPUTIME_SOFTIRQ, cpu);
+ rsrp->cputime_system = kcpustat_field(CPUTIME_SYSTEM, cpu);
rsrp->nr_hardirqs = kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu);
rsrp->nr_softirqs = kstat_cpu_softirqs_sum(cpu);
rsrp->nr_csw = nr_context_switches_cpu(cpu);
@@ -1632,17 +1629,21 @@ static void rcu_sr_put_wait_head(struct llist_node *node)
atomic_set_release(&sr_wn->inuse, 0);
}
-/* Enable rcu_normal_wake_from_gp automatically on small systems. */
-#define WAKE_FROM_GP_CPU_THRESHOLD 16
-
-static int rcu_normal_wake_from_gp = -1;
+static int rcu_normal_wake_from_gp = 1;
module_param(rcu_normal_wake_from_gp, int, 0644);
static struct workqueue_struct *sync_wq;
+#define RCU_SR_NORMAL_LATCH_THR 64
+
+/* Number of in-flight synchronize_rcu() calls queued on srs_next. */
+static atomic_long_t rcu_sr_normal_count;
+static int rcu_sr_normal_latched; /* 0/1 */
+
static void rcu_sr_normal_complete(struct llist_node *node)
{
struct rcu_synchronize *rs = container_of(
(struct rcu_head *) node, struct rcu_synchronize, head);
+ long nr;
WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) &&
!poll_state_synchronize_rcu_full(&rs->oldstate),
@@ -1650,6 +1651,15 @@ static void rcu_sr_normal_complete(struct llist_node *node)
/* Finally. */
complete(&rs->completion);
+ nr = atomic_long_dec_return(&rcu_sr_normal_count);
+ WARN_ON_ONCE(nr < 0);
+
+ /*
+ * Unlatch: switch back to normal path when fully
+ * drained and if it has been latched.
+ */
+ if (nr == 0)
+ (void)cmpxchg_relaxed(&rcu_sr_normal_latched, 1, 0);
}
static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
@@ -1795,6 +1805,24 @@ static bool rcu_sr_normal_gp_init(void)
static void rcu_sr_normal_add_req(struct rcu_synchronize *rs)
{
+ /*
+ * Increment before publish to avoid a complete
+ * vs enqueue race on latch.
+ */
+ long nr = atomic_long_inc_return(&rcu_sr_normal_count);
+
+ /*
+ * Latch when threshold is reached. Checking for an exact match
+ * restricts cmpxchg() to a single context.
+ *
+ * This latch is intentionally relaxed and best-effort. Concurrent
+ * set/clear can race and temporarily lose the latch, which is OK
+ * because it only selects between the fast and fallback paths.
+ */
+ if (nr == RCU_SR_NORMAL_LATCH_THR)
+ (void)cmpxchg_relaxed(&rcu_sr_normal_latched, 0, 1);
+
+ /* Publish for the GP kthread/worker. */
llist_add((struct llist_node *) &rs->head, &rcu_state.srs_next);
}
@@ -2584,7 +2612,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
const long npj = NSEC_PER_SEC / HZ;
long rrn = READ_ONCE(rcu_resched_ns);
- rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn;
+ rrn = clamp(rrn, NSEC_PER_MSEC, NSEC_PER_SEC);
tlimit = local_clock() + rrn;
jlimit = jiffies + (rrn + npj + 1) / npj;
jlimit_check = true;
@@ -3278,14 +3306,15 @@ static void synchronize_rcu_normal(void)
{
struct rcu_synchronize rs;
+ init_rcu_head_on_stack(&rs.head);
trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("request"));
- if (READ_ONCE(rcu_normal_wake_from_gp) < 1) {
+ if (READ_ONCE(rcu_normal_wake_from_gp) < 1 ||
+ READ_ONCE(rcu_sr_normal_latched)) {
wait_rcu_gp(call_rcu_hurry);
goto trace_complete_out;
}
- init_rcu_head_on_stack(&rs.head);
init_completion(&rs.completion);
/*
@@ -3302,10 +3331,10 @@ static void synchronize_rcu_normal(void)
/* Now we can wait. */
wait_for_completion(&rs.completion);
- destroy_rcu_head_on_stack(&rs.head);
trace_complete_out:
trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("complete"));
+ destroy_rcu_head_on_stack(&rs.head);
}
/**
@@ -4904,12 +4933,6 @@ void __init rcu_init(void)
sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
WARN_ON(!sync_wq);
- /* Respect if explicitly disabled via a boot parameter. */
- if (rcu_normal_wake_from_gp < 0) {
- if (num_possible_cpus() <= WAKE_FROM_GP_CPU_THRESHOLD)
- rcu_normal_wake_from_gp = 1;
- }
-
/* Fill in default value for rcutree.qovld boot parameter. */
/* -After- the rcu_node ->lock fields are initialized! */
if (qovld < 0)
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 1047b30cd46b..373b877cf171 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -655,7 +655,7 @@ static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu)
* No-CBs GP kthreads come here to wait for additional callbacks to show up
* or for grace periods to end.
*/
-static void nocb_gp_wait(struct rcu_data *my_rdp)
+static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
{
bool bypass = false;
int __maybe_unused cpu = my_rdp->cpu;
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index b67532cb8770..cf7ae51cba40 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -479,7 +479,6 @@ static void print_cpu_stat_info(int cpu)
{
struct rcu_snap_record rsr, *rsrp;
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct kernel_cpustat *kcsp = &kcpustat_cpu(cpu);
if (!rcu_cpu_stall_cputime)
return;
@@ -488,9 +487,9 @@ static void print_cpu_stat_info(int cpu)
if (rsrp->gp_seq != rdp->gp_seq)
return;
- rsr.cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu);
- rsr.cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu);
- rsr.cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
+ rsr.cputime_irq = kcpustat_field(CPUTIME_IRQ, cpu);
+ rsr.cputime_softirq = kcpustat_field(CPUTIME_SOFTIRQ, cpu);
+ rsr.cputime_system = kcpustat_field(CPUTIME_SYSTEM, cpu);
pr_err("\t hardirqs softirqs csw/system\n");
pr_err("\t number: %8lld %10d %12lld\n",
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 38d3ef540760..e75e3a5e312c 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -236,11 +236,6 @@ static int __init rseq_debugfs_init(void)
}
__initcall(rseq_debugfs_init);
-static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id)
-{
- return rseq_set_ids_get_csaddr(t, ids, node_id, NULL);
-}
-
static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
{
struct rseq __user *urseq = t->rseq.usrptr;
@@ -258,14 +253,16 @@ efault:
static void rseq_slowpath_update_usr(struct pt_regs *regs)
{
/*
- * Preserve rseq state and user_irq state. The generic entry code
- * clears user_irq on the way out, the non-generic entry
- * architectures are not having user_irq.
+ * Preserve has_rseq and user_irq state. The generic entry code clears
+ * user_irq on the way out, the non-generic entry architectures are not
+ * setting user_irq.
*/
- const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, };
+ const struct rseq_event evt_mask = {
+ .has_rseq = RSEQ_HAS_RSEQ_VERSION_MASK,
+ .user_irq = true,
+ };
struct task_struct *t = current;
struct rseq_ids ids;
- u32 node_id;
bool event;
if (unlikely(t->flags & PF_EXITING))
@@ -301,9 +298,9 @@ static void rseq_slowpath_update_usr(struct pt_regs *regs)
if (!event)
return;
- node_id = cpu_to_node(ids.cpu_id);
+ ids.node_id = cpu_to_node(ids.cpu_id);
- if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) {
+ if (unlikely(!rseq_update_usr(t, regs, &ids))) {
/*
* Clear the errors just in case this might survive magically, but
* leave the rest intact.
@@ -335,8 +332,9 @@ void __rseq_handle_slowpath(struct pt_regs *regs)
void __rseq_signal_deliver(int sig, struct pt_regs *regs)
{
rseq_stat_inc(rseq_stats.signal);
+
/*
- * Don't update IDs, they are handled on exit to user if
+ * Don't update IDs yet, they are handled on exit to user if
* necessary. The important thing is to abort a critical section of
* the interrupted context as after this point the instruction
* pointer in @regs points to the signal handler.
@@ -349,6 +347,13 @@ void __rseq_signal_deliver(int sig, struct pt_regs *regs)
current->rseq.event.error = 0;
force_sigsegv(sig);
}
+
+ /*
+ * In legacy mode, force the update of IDs before returning to user
+ * space to stay compatible.
+ */
+ if (!rseq_v2(current))
+ rseq_force_update();
}
/*
@@ -384,19 +389,22 @@ void rseq_syscall(struct pt_regs *regs)
static bool rseq_reset_ids(void)
{
- struct rseq_ids ids = {
- .cpu_id = RSEQ_CPU_ID_UNINITIALIZED,
- .mm_cid = 0,
- };
+ struct rseq __user *rseq = current->rseq.usrptr;
/*
* If this fails, terminate it because this leaves the kernel in
* stupid state as exit to user space will try to fixup the ids
* again.
*/
- if (rseq_set_ids(current, &ids, 0))
- return true;
+ scoped_user_rw_access(rseq, efault) {
+ unsafe_put_user(0, &rseq->cpu_id_start, efault);
+ unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
+ unsafe_put_user(0, &rseq->node_id, efault);
+ unsafe_put_user(0, &rseq->mm_cid, efault);
+ }
+ return true;
+efault:
force_sig(SIGSEGV);
return false;
}
@@ -404,70 +412,29 @@ static bool rseq_reset_ids(void)
/* The original rseq structure size (including padding) is 32 bytes. */
#define ORIG_RSEQ_SIZE 32
-/*
- * sys_rseq - setup restartable sequences for caller thread.
- */
-SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
+static long rseq_register(struct rseq __user * rseq, u32 rseq_len, int flags, u32 sig)
{
u32 rseqfl = 0;
+ u8 version = 1;
- if (flags & RSEQ_FLAG_UNREGISTER) {
- if (flags & ~RSEQ_FLAG_UNREGISTER)
- return -EINVAL;
- /* Unregister rseq for current thread. */
- if (current->rseq.usrptr != rseq || !current->rseq.usrptr)
- return -EINVAL;
- if (rseq_len != current->rseq.len)
- return -EINVAL;
- if (current->rseq.sig != sig)
- return -EPERM;
- if (!rseq_reset_ids())
- return -EFAULT;
- rseq_reset(current);
- return 0;
- }
-
- if (unlikely(flags & ~(RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)))
- return -EINVAL;
-
- if (current->rseq.usrptr) {
- /*
- * If rseq is already registered, check whether
- * the provided address differs from the prior
- * one.
- */
- if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len)
- return -EINVAL;
- if (current->rseq.sig != sig)
- return -EPERM;
- /* Already registered. */
- return -EBUSY;
- }
-
- /*
- * If there was no rseq previously registered, ensure the provided rseq
- * is properly aligned, as communcated to user-space through the ELF
- * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq
- * size, the required alignment is the original struct rseq alignment.
- *
- * The rseq_len is required to be greater or equal to the original rseq
- * size. In order to be valid, rseq_len is either the original rseq size,
- * or large enough to contain all supported fields, as communicated to
- * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
- */
- if (rseq_len < ORIG_RSEQ_SIZE ||
- (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) ||
- (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) ||
- rseq_len < offsetof(struct rseq, end))))
- return -EINVAL;
if (!access_ok(rseq, rseq_len))
return -EFAULT;
- if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
- rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
- if (rseq_slice_extension_enabled() &&
- (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON))
- rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
+ /*
+ * On architectures which use the generic IRQ entry code (at least), a
+ * registration whose already validated @rseq_len is greater than the
+ * original fixed v1 size selects the optimized v2 ABI mode, which also
+ * enables extended RSEQ features beyond MMCID.
+ */
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY) && rseq_len > ORIG_RSEQ_SIZE)
+ version = 2;
+
+ if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION) && version > 1) {
+ if (rseq_slice_extension_enabled()) {
+ rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
+ if (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)
+ rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
+ }
}
scoped_user_write_access(rseq, efault) {
@@ -485,7 +452,15 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
unsafe_put_user(0U, &rseq->node_id, efault);
unsafe_put_user(0U, &rseq->mm_cid, efault);
- unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
+
+ /*
+ * All fields past mm_cid are only valid for non-legacy v2
+ * registrations.
+ */
+ if (version > 1) {
+ if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION))
+ unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
+ }
}
/*
@@ -501,11 +476,10 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
#endif
/*
- * If rseq was previously inactive, and has just been
- * registered, ensure the cpu_id_start and cpu_id fields
- * are updated before returning to user-space.
+ * Ensure the cpu_id_start and cpu_id fields are updated before
+ * returning to user-space.
*/
- current->rseq.event.has_rseq = true;
+ current->rseq.event.has_rseq = version;
rseq_force_update();
return 0;
@@ -513,6 +487,80 @@ efault:
return -EFAULT;
}
+static long rseq_unregister(struct rseq __user * rseq, u32 rseq_len, int flags, u32 sig)
+{
+ if (flags & ~RSEQ_FLAG_UNREGISTER)
+ return -EINVAL;
+ if (current->rseq.usrptr != rseq || !current->rseq.usrptr)
+ return -EINVAL;
+ if (rseq_len != current->rseq.len)
+ return -EINVAL;
+ if (current->rseq.sig != sig)
+ return -EPERM;
+ if (!rseq_reset_ids())
+ return -EFAULT;
+ rseq_reset(current);
+ return 0;
+}
+
+static long rseq_reregister(struct rseq __user * rseq, u32 rseq_len, u32 sig)
+{
+ /*
+ * If rseq is already registered, check whether the provided address
+ * differs from the prior one.
+ */
+ if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len)
+ return -EINVAL;
+ if (current->rseq.sig != sig)
+ return -EPERM;
+ /* Already registered. */
+ return -EBUSY;
+}
+
+static bool rseq_length_valid(struct rseq __user *rseq, unsigned int rseq_len)
+{
+ /*
+ * Ensure the provided rseq is properly aligned, as communicated to
+ * user-space through the ELF auxiliary vector AT_RSEQ_ALIGN. If
+ * rseq_len is the original rseq size, the required alignment is the
+ * original struct rseq alignment.
+ *
+ * In order to be valid, rseq_len is either the original rseq size, or
+ * large enough to contain all supported fields, as communicated to
+ * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
+ */
+ if (rseq_len < ORIG_RSEQ_SIZE)
+ return false;
+
+ if (rseq_len == ORIG_RSEQ_SIZE)
+ return IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE);
+
+ return IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) &&
+ rseq_len >= offsetof(struct rseq, end);
+}
+
+#define RSEQ_FLAGS_SUPPORTED (RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)
+
+/*
+ * sys_rseq - Register or unregister restartable sequences for the caller thread.
+ */
+SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
+{
+ if (flags & RSEQ_FLAG_UNREGISTER)
+ return rseq_unregister(rseq, rseq_len, flags, sig);
+
+ if (unlikely(flags & ~RSEQ_FLAGS_SUPPORTED))
+ return -EINVAL;
+
+ if (current->rseq.usrptr)
+ return rseq_reregister(rseq, rseq_len, sig);
+
+ if (!rseq_length_valid(rseq, rseq_len))
+ return -EINVAL;
+
+ return rseq_register(rseq, rseq_len, flags, sig);
+}
+
#ifdef CONFIG_RSEQ_SLICE_EXTENSION
struct slice_timer {
struct hrtimer timer;
@@ -713,6 +761,8 @@ int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
return -ENOTSUPP;
if (!current->rseq.usrptr)
return -ENXIO;
+ if (!rseq_v2(current))
+ return -ENOTSUPP;
/* No change? */
if (enable == !!current->rseq.slice.state.enabled)
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index 755883faf751..067979a7b69e 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -58,8 +58,17 @@
#include "deadline.c"
#ifdef CONFIG_SCHED_CLASS_EXT
+# include <linux/btf_ids.h>
+# include <linux/find.h>
+# include <linux/genalloc.h>
+# include "ext_types.h"
# include "ext_internal.h"
+# include "ext_cid.h"
+# include "ext_arena.h"
+# include "ext_idle.h"
# include "ext.c"
+# include "ext_cid.c"
+# include "ext_arena.c"
# include "ext_idle.c"
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b8871449d3c6..8b791e9e9f67 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -537,13 +537,22 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
/* need a wrapper since we may need to trace from modules */
EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp);
-/* Call via the helper macro trace_set_current_state. */
+/*
+ * Call via the helper macro trace_set_current_state.
+ * Calls to this function MUST be guarded by a
+ * tracepoint_enabled(sched_set_state_tp)
+ */
void __trace_set_current_state(int state_value)
{
- trace_sched_set_state_tp(current, state_value);
+ trace_call__sched_set_state_tp(current, state_value);
}
EXPORT_SYMBOL(__trace_set_current_state);
+int task_llc(const struct task_struct *p)
+{
+ return per_cpu(sd_llc_id, task_cpu(p));
+}
+
/*
* Serialization rules:
*
@@ -615,6 +624,12 @@ EXPORT_SYMBOL(__trace_set_current_state);
* [ The astute reader will observe that it is possible for two tasks on one
* CPU to have ->on_cpu = 1 at the same time. ]
*
+ * p->is_blocked <- { 0, 1 }:
+ *
+ * is set by try_to_block_task() and cleared by ttwu_do_wakeup() and tracks
+ * if the task is blocked. Traditionally this would mirror p->on_rq, however
+ * due to things like DELAY_DEQUEUE and PROXY_EXEC, this can diverge.
+ *
* task_cpu(p): is changed by set_task_cpu(), the rules are:
*
* - Don't call set_task_cpu() on a blocked task:
@@ -1203,9 +1218,13 @@ static void __resched_curr(struct rq *rq, int tif)
}
}
+/*
+ * Calls to this function MUST be guarded by a
+ * tracepoint_enabled(sched_set_need_resched_tp)
+ */
void __trace_set_need_resched(struct task_struct *curr, int tif)
{
- trace_sched_set_need_resched_tp(curr, smp_processor_id(), tif);
+ trace_call__sched_set_need_resched_tp(curr, smp_processor_id(), tif);
}
EXPORT_SYMBOL_GPL(__trace_set_need_resched);
@@ -2223,8 +2242,29 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
dequeue_task(rq, p, flags);
}
-static void block_task(struct rq *rq, struct task_struct *p, int flags)
+static void block_task(struct rq *rq, struct task_struct *p, unsigned long task_state)
{
+ int flags = DEQUEUE_NOCLOCK;
+
+ p->sched_contributes_to_load =
+ (task_state & TASK_UNINTERRUPTIBLE) &&
+ !(task_state & TASK_NOLOAD) &&
+ !(task_state & TASK_FROZEN);
+
+ if (unlikely(is_special_task_state(task_state)))
+ flags |= DEQUEUE_SPECIAL;
+
+ /*
+ * __schedule() ttwu()
+ * prev_state = prev->state; if (p->on_rq && ...)
+ * if (prev_state) goto out;
+ * p->on_rq = 0; smp_acquire__after_ctrl_dep();
+ * p->state = TASK_WAKING
+ *
+ * Where __schedule() and ttwu() have matching control dependencies.
+ *
+ * After this, schedule() must not care about p->state any more.
+ */
if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags))
__block_task(rq, p);
}
@@ -3685,6 +3725,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
*/
static inline void ttwu_do_wakeup(struct task_struct *p)
{
+ p->is_blocked = 0;
WRITE_ONCE(p->__state, TASK_RUNNING);
trace_sched_wakeup(p);
}
@@ -3701,6 +3742,65 @@ void update_rq_avg_idle(struct rq *rq)
rq->idle_stamp = 0;
}
+#ifdef CONFIG_SCHED_PROXY_EXEC
+static void zap_balance_callbacks(struct rq *rq);
+
+static inline void proxy_reset_donor(struct rq *rq)
+{
+ WARN_ON_ONCE(rq->donor == rq->curr);
+
+ put_prev_set_next_task(rq, rq->donor, rq->curr);
+ rq_set_donor(rq, rq->curr);
+ zap_balance_callbacks(rq);
+ resched_curr(rq);
+}
+
+/*
+ * Checks to see if task p has been proxy-migrated to another rq
+ * and needs to be returned. If so, we deactivate the task here
+ * so that it can be properly woken up on the p->wake_cpu
+ * (or whichever cpu select_task_rq() picks at the bottom of
+ * try_to_wake_up()).
+ */
+static inline bool proxy_needs_return(struct rq *rq, struct task_struct *p)
+{
+ /*
+ * Typically per __set_task_cpu(), task_cpu(p) == p->wake_cpu.
+ *
+ * However, proxy_set_task_cpu() is such that it preserves the
+ * original cpu in p->wake_cpu while migrating p for proxy reasons
+ * (possibly outside of the allowed p->cpus_ptr).
+ *
+ * Furthermore, migration_cpu_stop() / __migrate_swap_task(), will
+ * only set p->wake_cpu when !p->on_rq, and since here p->on_rq, this
+ * will not apply. But if it did, this check is the safe way around
+ * and would migrate.
+ */
+ if (task_cpu(p) == p->wake_cpu)
+ return false;
+
+ scoped_guard(raw_spinlock, &p->blocked_lock) {
+ /* Task is waking up; clear any blocked_on relationship */
+ __clear_task_blocked_on(p, NULL);
+
+ /* If already current, don't need to return migrate */
+ if (task_current(rq, p))
+ return false;
+
+ /* If we're return migrating the rq->donor, reset the donor to rq->curr */
+ if (task_current_donor(rq, p))
+ proxy_reset_donor(rq);
+ }
+ block_task(rq, p, TASK_WAKING);
+ return true;
+}
+#else /* !CONFIG_SCHED_PROXY_EXEC */
+static inline bool proxy_needs_return(struct rq *rq, struct task_struct *p)
+{
+ return false;
+}
+#endif /* CONFIG_SCHED_PROXY_EXEC */
+
static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
struct rq_flags *rf)
@@ -3716,8 +3816,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
en_flags |= ENQUEUE_RQ_SELECTED;
if (wake_flags & WF_MIGRATED)
en_flags |= ENQUEUE_MIGRATED;
- else
- if (p->in_iowait) {
+ else if (p->in_iowait) {
delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
@@ -3765,28 +3864,28 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
*/
static int ttwu_runnable(struct task_struct *p, int wake_flags)
{
- struct rq_flags rf;
- struct rq *rq;
- int ret = 0;
+ ACQUIRE(__task_rq_lock, guard)(p);
+ struct rq *rq = guard.rq;
- rq = __task_rq_lock(p, &rf);
- if (task_on_rq_queued(p)) {
- update_rq_clock(rq);
+ if (!task_on_rq_queued(p))
+ return 0;
+
+ update_rq_clock(rq);
+ if (p->is_blocked) {
if (p->se.sched_delayed)
enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
- if (!task_on_cpu(rq, p)) {
- /*
- * When on_rq && !on_cpu the task is preempted, see if
- * it should preempt the task that is current now.
- */
- wakeup_preempt(rq, p, wake_flags);
- }
- ttwu_do_wakeup(p);
- ret = 1;
+ if (proxy_needs_return(rq, p))
+ return 0;
}
- __task_rq_unlock(rq, p, &rf);
-
- return ret;
+ if (!task_on_cpu(rq, p)) {
+ /*
+ * When on_rq && !on_cpu the task is preempted, see if
+ * it should preempt the task that is current now.
+ */
+ wakeup_preempt(rq, p, wake_flags);
+ }
+ ttwu_do_wakeup(p);
+ return 1;
}
void sched_ttwu_pending(void *arg)
@@ -4173,6 +4272,9 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* it disabling IRQs (this allows not taking ->pi_lock).
*/
WARN_ON_ONCE(p->se.sched_delayed);
+ WARN_ON_ONCE(p->is_blocked);
+ /* If p is current, we know we can run here, so clear blocked_on */
+ clear_task_blocked_on(p, NULL);
if (!ttwu_state_match(p, state, &success))
goto out;
@@ -4189,6 +4291,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
smp_mb__after_spinlock();
+
if (!ttwu_state_match(p, state, &success))
break;
@@ -4297,6 +4400,16 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
wake_flags |= WF_MIGRATED;
psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
+ } else if (cpu != p->wake_cpu) {
+ /*
+ * If we were proxy-migrated to cpu, then
+ * select_task_rq() picks cpu instead of wake_cpu
+ * to return to, we won't call set_task_cpu(),
+ * leaving a stale wake_cpu pointing to where we
+ * proxy-migrated from. So just fix up wake_cpu here
+ * if it's not correct.
+ */
+ p->wake_cpu = cpu;
}
ttwu_queue(p, cpu, wake_flags);
@@ -4463,6 +4576,7 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p)
/* A delayed task cannot be in clone(). */
WARN_ON_ONCE(p->se.sched_delayed);
+ WARN_ON_ONCE(p->is_blocked);
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
@@ -4498,6 +4612,7 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p)
init_numa_balancing(clone_flags, p);
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
+ init_sched_mm(p);
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -4710,6 +4825,7 @@ int sched_fork(u64 clone_flags, struct task_struct *p)
p->policy = SCHED_NORMAL;
p->static_prio = NICE_TO_PRIO(0);
p->rt_priority = 0;
+ p->timer_slack_ns = p->default_timer_slack_ns;
} else if (PRIO_TO_NICE(p->static_prio) < 0)
p->static_prio = NICE_TO_PRIO(0);
@@ -5518,7 +5634,11 @@ void sched_exec(void)
}
DEFINE_PER_CPU(struct kernel_stat, kstat);
-DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat) = {
+#ifdef CONFIG_NO_HZ_COMMON
+ .idle_sleeptime_seq = SEQCNT_ZERO(kernel_cpustat.idle_sleeptime_seq)
+#endif
+};
EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
@@ -5972,10 +6092,9 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
schedstat_inc(this_rq()->sched_count);
}
-static void prev_balance(struct rq *rq, struct task_struct *prev,
- struct rq_flags *rf)
+static void prev_balance(struct rq *rq, struct rq_flags *rf)
{
- const struct sched_class *start_class = prev->sched_class;
+ const struct sched_class *start_class = rq->donor->sched_class;
const struct sched_class *class;
/*
@@ -5987,7 +6106,7 @@ static void prev_balance(struct rq *rq, struct task_struct *prev,
* a runnable task of @class priority or higher.
*/
for_active_class_range(class, start_class, &idle_sched_class) {
- if (class->balance && class->balance(rq, prev, rf))
+ if (class->balance && class->balance(rq, rf))
break;
}
}
@@ -5996,7 +6115,7 @@ static void prev_balance(struct rq *rq, struct task_struct *prev,
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+__pick_next_task(struct rq *rq, struct rq_flags *rf)
__must_hold(__rq_lockp(rq))
{
const struct sched_class *class;
@@ -6013,40 +6132,31 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* higher scheduling class, because otherwise those lose the
* opportunity to pull in more work from other CPUs.
*/
- if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
+ if (likely(!sched_class_above(rq->donor->sched_class, &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_queued)) {
- p = pick_next_task_fair(rq, prev, rf);
+ p = pick_task_fair(rq, rf);
if (unlikely(p == RETRY_TASK))
goto restart;
/* Assume the next prioritized class is idle_sched_class */
- if (!p) {
+ if (!p)
p = pick_task_idle(rq, rf);
- put_prev_set_next_task(rq, prev, p);
- }
+ put_prev_set_next_task(rq, rq->donor, p);
return p;
}
restart:
- prev_balance(rq, prev, rf);
+ prev_balance(rq, rf);
for_each_active_class(class) {
- if (class->pick_next_task) {
- p = class->pick_next_task(rq, prev, rf);
- if (unlikely(p == RETRY_TASK))
- goto restart;
- if (p)
- return p;
- } else {
- p = class->pick_task(rq, rf);
- if (unlikely(p == RETRY_TASK))
- goto restart;
- if (p) {
- put_prev_set_next_task(rq, prev, p);
- return p;
- }
+ p = class->pick_task(rq, rf);
+ if (unlikely(p == RETRY_TASK))
+ goto restart;
+ if (p) {
+ put_prev_set_next_task(rq, rq->donor, p);
+ return p;
}
}
@@ -6097,7 +6207,7 @@ extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_f
static void queue_core_balance(struct rq *rq);
static struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+pick_next_task(struct rq *rq, struct rq_flags *rf)
__must_hold(__rq_lockp(rq))
{
struct task_struct *next, *p, *max;
@@ -6110,7 +6220,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
bool need_sync;
if (!sched_core_enabled(rq))
- return __pick_next_task(rq, prev, rf);
+ return __pick_next_task(rq, rf);
cpu = cpu_of(rq);
@@ -6123,7 +6233,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
*/
rq->core_pick = NULL;
rq->core_dl_server = NULL;
- return __pick_next_task(rq, prev, rf);
+ return __pick_next_task(rq, rf);
}
/*
@@ -6147,7 +6257,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
goto out_set_next;
}
- prev_balance(rq, prev, rf);
+ prev_balance(rq, rf);
smt_mask = cpu_smt_mask(cpu);
need_sync = !!rq->core->core_cookie;
@@ -6329,7 +6439,7 @@ restart_multi:
}
out_set_next:
- put_prev_set_next_task(rq, prev, next);
+ put_prev_set_next_task(rq, rq->donor, next);
if (rq->core->core_forceidle_count && next == rq->idle)
queue_core_balance(rq);
@@ -6552,10 +6662,10 @@ static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
static inline void sched_core_cpu_dying(unsigned int cpu) {}
static struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+pick_next_task(struct rq *rq, struct rq_flags *rf)
__must_hold(__rq_lockp(rq))
{
- return __pick_next_task(rq, prev, rf);
+ return __pick_next_task(rq, rf);
}
#endif /* !CONFIG_SCHED_CORE */
@@ -6583,16 +6693,19 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
unsigned long *task_state_p, bool should_block)
{
unsigned long task_state = *task_state_p;
- int flags = DEQUEUE_NOCLOCK;
+
+ WARN_ON_ONCE(p->is_blocked);
if (signal_pending_state(task_state, p)) {
WRITE_ONCE(p->__state, TASK_RUNNING);
*task_state_p = TASK_RUNNING;
- set_task_blocked_on_waking(p, NULL);
+ clear_task_blocked_on(p, NULL);
return false;
}
+ p->is_blocked = 1;
+
/*
* We check should_block after signal_pending because we
* will want to wake the task in that case. But if
@@ -6603,26 +6716,7 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
if (!should_block)
return false;
- p->sched_contributes_to_load =
- (task_state & TASK_UNINTERRUPTIBLE) &&
- !(task_state & TASK_NOLOAD) &&
- !(task_state & TASK_FROZEN);
-
- if (unlikely(is_special_task_state(task_state)))
- flags |= DEQUEUE_SPECIAL;
-
- /*
- * __schedule() ttwu()
- * prev_state = prev->state; if (p->on_rq && ...)
- * if (prev_state) goto out;
- * p->on_rq = 0; smp_acquire__after_ctrl_dep();
- * p->state = TASK_WAKING
- *
- * Where __schedule() and ttwu() have matching control dependencies.
- *
- * After this, schedule() must not care about p->state any more.
- */
- block_task(rq, p, flags);
+ block_task(rq, p, task_state);
return true;
}
@@ -6645,18 +6739,18 @@ static inline void proxy_set_task_cpu(struct task_struct *p, int cpu)
static inline struct task_struct *proxy_resched_idle(struct rq *rq)
{
put_prev_set_next_task(rq, rq->donor, rq->idle);
+ rq->next_class = &idle_sched_class;
rq_set_donor(rq, rq->idle);
set_tsk_need_resched(rq->idle);
return rq->idle;
}
-static bool proxy_deactivate(struct rq *rq, struct task_struct *donor)
+static void proxy_deactivate(struct rq *rq, struct task_struct *donor)
{
unsigned long state = READ_ONCE(donor->__state);
- /* Don't deactivate if the state has been changed to TASK_RUNNING */
- if (state == TASK_RUNNING)
- return false;
+ WARN_ON_ONCE(state == TASK_RUNNING);
+ WARN_ON_ONCE(donor->blocked_on);
/*
* Because we got donor from pick_next_task(), it is *crucial*
* that we call proxy_resched_idle() before we deactivate it.
@@ -6667,7 +6761,7 @@ static bool proxy_deactivate(struct rq *rq, struct task_struct *donor)
* need to be changed from next *before* we deactivate.
*/
proxy_resched_idle(rq);
- return try_to_block_task(rq, donor, &state, true);
+ block_task(rq, donor, state);
}
static inline void proxy_release_rq_lock(struct rq *rq, struct rq_flags *rf)
@@ -6741,76 +6835,21 @@ static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf,
proxy_reacquire_rq_lock(rq, rf);
}
-static void proxy_force_return(struct rq *rq, struct rq_flags *rf,
- struct task_struct *p)
- __must_hold(__rq_lockp(rq))
-{
- struct rq *task_rq, *target_rq = NULL;
- int cpu, wake_flag = WF_TTWU;
-
- lockdep_assert_rq_held(rq);
- WARN_ON(p == rq->curr);
-
- if (p == rq->donor)
- proxy_resched_idle(rq);
-
- proxy_release_rq_lock(rq, rf);
- /*
- * We drop the rq lock, and re-grab task_rq_lock to get
- * the pi_lock (needed for select_task_rq) as well.
- */
- scoped_guard (task_rq_lock, p) {
- task_rq = scope.rq;
-
- /*
- * Since we let go of the rq lock, the task may have been
- * woken or migrated to another rq before we got the
- * task_rq_lock. So re-check we're on the same RQ. If
- * not, the task has already been migrated and that CPU
- * will handle any futher migrations.
- */
- if (task_rq != rq)
- break;
-
- /*
- * Similarly, if we've been dequeued, someone else will
- * wake us
- */
- if (!task_on_rq_queued(p))
- break;
-
- /*
- * Since we should only be calling here from __schedule()
- * -> find_proxy_task(), no one else should have
- * assigned current out from under us. But check and warn
- * if we see this, then bail.
- */
- if (task_current(task_rq, p) || task_on_cpu(task_rq, p)) {
- WARN_ONCE(1, "%s rq: %i current/on_cpu task %s %d on_cpu: %i\n",
- __func__, cpu_of(task_rq),
- p->comm, p->pid, p->on_cpu);
- break;
- }
-
- update_rq_clock(task_rq);
- deactivate_task(task_rq, p, DEQUEUE_NOCLOCK);
- cpu = select_task_rq(p, p->wake_cpu, &wake_flag);
- set_task_cpu(p, cpu);
- target_rq = cpu_rq(cpu);
- clear_task_blocked_on(p, NULL);
- }
-
- if (target_rq)
- attach_one_task(target_rq, p);
-
- proxy_reacquire_rq_lock(rq, rf);
-}
-
/*
* Find runnable lock owner to proxy for mutex blocked donor
*
* Follow the blocked-on relation:
- * task->blocked_on -> mutex->owner -> task...
+ *
+ * ,-> task
+ * | | blocked-on
+ * | v
+ * blocked_donor | mutex
+ * | | owner
+ * | v
+ * `-- task
+ *
+ * and set the blocked_donor relation; the latter is used by the mutex
+ * code to find which (blocked) task to hand off to.
*
* Lock order:
*
@@ -6830,18 +6869,19 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
bool curr_in_chain = false;
int this_cpu = cpu_of(rq);
struct task_struct *p;
- struct mutex *mutex;
int owner_cpu;
/* Follow blocked_on chain. */
- for (p = donor; (mutex = p->blocked_on); p = owner) {
+ for (p = donor; p->is_blocked; p = owner) {
/* if its PROXY_WAKING, do return migration or run if current */
- if (mutex == PROXY_WAKING) {
+ struct mutex *mutex = p->blocked_on;
+ if (!mutex) {
+ clear_task_blocked_on(p, mutex);
if (task_current(rq, p)) {
- clear_task_blocked_on(p, PROXY_WAKING);
+ p->is_blocked = 0;
return p;
}
- goto force_return;
+ goto deactivate;
}
/*
@@ -6872,17 +6912,19 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
* and return p (if it is current and safe to
* just run on this rq), or return-migrate the task.
*/
+ __clear_task_blocked_on(p, NULL);
if (task_current(rq, p)) {
- __clear_task_blocked_on(p, NULL);
+ p->is_blocked = 0;
return p;
}
- goto force_return;
+ goto deactivate;
}
if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) {
/* XXX Don't handle blocked owners/delayed dequeue yet */
if (curr_in_chain)
return proxy_resched_idle(rq);
+ __clear_task_blocked_on(p, NULL);
goto deactivate;
}
@@ -6950,17 +6992,13 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
* rq, therefore holding @rq->lock is sufficient to
* guarantee its existence, as per ttwu_remote().
*/
+ owner->blocked_donor = p;
}
WARN_ON_ONCE(owner && !owner->on_rq);
return owner;
deactivate:
- if (proxy_deactivate(rq, donor))
- return NULL;
- /* If deactivate fails, force return */
- p = donor;
-force_return:
- proxy_force_return(rq, rf, p);
+ proxy_deactivate(rq, p);
return NULL;
migrate_task:
proxy_migrate_task(rq, rf, p, owner_cpu);
@@ -7102,13 +7140,14 @@ static void __sched notrace __schedule(int sched_mode)
pick_again:
assert_balance_callbacks_empty(rq);
- next = pick_next_task(rq, rq->donor, &rf);
+ next = pick_next_task(rq, &rf);
rq->next_class = next->sched_class;
if (sched_proxy_exec()) {
struct task_struct *prev_donor = rq->donor;
rq_set_donor(rq, next);
- if (unlikely(next->blocked_on)) {
+ next->blocked_donor = NULL;
+ if (unlikely(next->is_blocked)) {
next = find_proxy_task(rq, next, &rf);
if (!next) {
zap_balance_callbacks(rq);
@@ -7964,7 +8003,7 @@ static void __sched_dynamic_update(int mode)
break;
}
- preempt_dynamic_mode = mode;
+ WRITE_ONCE(preempt_dynamic_mode, mode);
}
void sched_dynamic_update(int mode)
@@ -8005,12 +8044,13 @@ static void __init preempt_dynamic_init(void)
}
}
-# define PREEMPT_MODEL_ACCESSOR(mode) \
- bool preempt_model_##mode(void) \
- { \
- WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \
- return preempt_dynamic_mode == preempt_dynamic_##mode; \
- } \
+# define PREEMPT_MODEL_ACCESSOR(mode) \
+ bool preempt_model_##mode(void) \
+ { \
+ int mode = READ_ONCE(preempt_dynamic_mode); \
+ WARN_ON_ONCE(mode == preempt_dynamic_undefined); \
+ return mode == preempt_dynamic_##mode; \
+ } \
EXPORT_SYMBOL_GPL(preempt_model_##mode)
PREEMPT_MODEL_ACCESSOR(none);
@@ -8604,18 +8644,14 @@ static void cpuset_cpu_inactive(unsigned int cpu)
static inline void sched_smt_present_inc(int cpu)
{
-#ifdef CONFIG_SCHED_SMT
if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
static_branch_inc_cpuslocked(&sched_smt_present);
-#endif
}
static inline void sched_smt_present_dec(int cpu)
{
-#ifdef CONFIG_SCHED_SMT
if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
static_branch_dec_cpuslocked(&sched_smt_present);
-#endif
}
int sched_cpu_activate(unsigned int cpu)
@@ -8670,7 +8706,8 @@ int sched_cpu_deactivate(unsigned int cpu)
* Remove CPU from nohz.idle_cpus_mask to prevent participating in
* load balancing when not active
*/
- nohz_balance_exit_idle(rq);
+ scoped_guard (rcu)
+ nohz_balance_exit_idle(rq);
set_cpu_active(cpu, false);
@@ -8694,6 +8731,8 @@ int sched_cpu_deactivate(unsigned int cpu)
*/
synchronize_rcu();
+ sched_domains_free_llc_id(cpu);
+
sched_set_rq_offline(rq, cpu);
scx_rq_deactivate(rq);
@@ -8703,9 +8742,7 @@ int sched_cpu_deactivate(unsigned int cpu)
*/
sched_smt_present_dec(cpu);
-#ifdef CONFIG_SCHED_SMT
sched_core_cpu_deactivate(cpu);
-#endif
if (!sched_smp_initialized)
return 0;
@@ -8873,7 +8910,7 @@ static struct kmem_cache *task_group_cache __ro_after_init;
void __init sched_init(void)
{
- unsigned long ptr = 0;
+ unsigned long __maybe_unused ptr = 0;
int i;
/* Make sure the linker didn't screw up */
@@ -8889,36 +8926,24 @@ void __init sched_init(void)
wait_bit_init();
#ifdef CONFIG_FAIR_GROUP_SCHED
- ptr += 2 * nr_cpu_ids * sizeof(void **);
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
- ptr += 2 * nr_cpu_ids * sizeof(void **);
-#endif
- if (ptr) {
- ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
- root_task_group.se = (struct sched_entity **)ptr;
- ptr += nr_cpu_ids * sizeof(void **);
+ root_task_group.cfs_rq = &runqueues.cfs;
- root_task_group.cfs_rq = (struct cfs_rq **)ptr;
- ptr += nr_cpu_ids * sizeof(void **);
-
- root_task_group.shares = ROOT_TASK_GROUP_LOAD;
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
+ root_task_group.shares = ROOT_TASK_GROUP_LOAD;
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_EXT_GROUP_SCHED
- scx_tg_init(&root_task_group);
+ scx_tg_init(&root_task_group);
#endif /* CONFIG_EXT_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
- root_task_group.rt_se = (struct sched_rt_entity **)ptr;
- ptr += nr_cpu_ids * sizeof(void **);
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
+ ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
+ root_task_group.rt_se = (struct sched_rt_entity **)ptr;
+ ptr += nr_cpu_ids * sizeof(void **);
- root_task_group.rt_rq = (struct rt_rq **)ptr;
- ptr += nr_cpu_ids * sizeof(void **);
+ root_task_group.rt_rq = (struct rt_rq **)ptr;
+ ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_RT_GROUP_SCHED */
- }
init_defrootdomain();
@@ -9027,6 +9052,11 @@ void __init sched_init(void)
rq->core_cookie = 0UL;
#endif
+#ifdef CONFIG_SCHED_CACHE
+ raw_spin_lock_init(&rq->cpu_epoch_lock);
+ rq->cpu_epoch_next = jiffies;
+#endif
+
zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
}
@@ -9828,15 +9858,18 @@ static int tg_set_cfs_bandwidth(struct task_group *tg,
}
for_each_online_cpu(i) {
- struct cfs_rq *cfs_rq = tg->cfs_rq[i];
+ struct cfs_rq *cfs_rq = tg_cfs_rq(tg, i);
struct rq *rq = cfs_rq->rq;
guard(rq_lock_irq)(rq);
+
cfs_rq->runtime_enabled = runtime_enabled;
cfs_rq->runtime_remaining = 1;
- if (cfs_rq->throttled)
+ if (cfs_rq->throttled) {
+ update_rq_clock(rq);
unthrottle_cfs_rq(cfs_rq);
+ }
}
if (runtime_was_enabled && !runtime_enabled)
@@ -9977,7 +10010,7 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
int i;
for_each_possible_cpu(i) {
- stats = __schedstats_from_se(tg->se[i]);
+ stats = __schedstats_from_se(tg_se(tg, i));
ws += schedstat_val(stats->wait_sum);
}
@@ -9996,7 +10029,7 @@ static u64 throttled_time_self(struct task_group *tg)
u64 total = 0;
for_each_possible_cpu(i) {
- total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time);
+ total += READ_ONCE(tg_cfs_rq(tg, i)->throttled_clock_self_time);
}
return total;
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 73b6b2426911..43e0bde3038e 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -136,7 +136,7 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
struct pid *grp;
int err = 0;
- if (!static_branch_likely(&sched_smt_present))
+ if (!sched_smt_active())
return -ENODEV;
BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_THREAD != PIDTYPE_PID);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index fbf31db0d2f3..679ac65be6b0 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -2,6 +2,7 @@
/*
* Simple CPU accounting cgroup controller
*/
+#include <linux/sched/clock.h>
#include <linux/sched/cputime.h>
#include <linux/tsacct_kern.h>
#include "sched.h"
@@ -46,7 +47,8 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
u64_stats_update_begin(&irqtime->sync);
cpustat[idx] += delta;
irqtime->total += delta;
- irqtime->tick_delta += delta;
+ if (!kcpustat_idle_dyntick())
+ irqtime->tick_delta += delta;
u64_stats_update_end(&irqtime->sync);
}
@@ -414,16 +416,219 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
}
}
-static void irqtime_account_idle_ticks(int ticks)
-{
- irqtime_account_process_tick(current, 0, ticks);
-}
#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
-static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
int nr_ticks) { }
#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
+#ifdef CONFIG_NO_HZ_COMMON
+static void kcpustat_idle_stop(struct kernel_cpustat *kc, u64 now)
+{
+ u64 *cpustat = kc->cpustat;
+ u64 delta, steal, steal_delta;
+ int iowait;
+
+ if (!kc->idle_elapse)
+ return;
+
+ iowait = nr_iowait_cpu(smp_processor_id()) > 0;
+ delta = now - kc->idle_entrytime;
+ steal = steal_account_process_time(delta);
+
+ /*
+ * Record the idle time after subtracting the steal time from
+ * previous update sequence. Don't subtract the steal time from
+ * the current update sequence to avoid readers moving backward.
+ */
+ write_seqcount_begin(&kc->idle_sleeptime_seq);
+ steal_delta = min_t(u64, kc->idle_stealtime[iowait], delta);
+ delta -= steal_delta;
+ kc->idle_stealtime[iowait] -= steal_delta;
+
+ if (iowait)
+ cpustat[CPUTIME_IOWAIT] += delta;
+ else
+ cpustat[CPUTIME_IDLE] += delta;
+
+ kc->idle_stealtime[iowait] += steal;
+ kc->idle_entrytime = now;
+ kc->idle_elapse = false;
+ write_seqcount_end(&kc->idle_sleeptime_seq);
+}
+
+static void kcpustat_idle_start(struct kernel_cpustat *kc, u64 now)
+{
+ /* Irqtime accounting might have been enabled in the middle of the IRQ */
+ if (kc->idle_elapse)
+ return;
+
+ write_seqcount_begin(&kc->idle_sleeptime_seq);
+ kc->idle_entrytime = now;
+ kc->idle_elapse = true;
+ write_seqcount_end(&kc->idle_sleeptime_seq);
+}
+
+void kcpustat_dyntick_stop(u64 now)
+{
+ struct kernel_cpustat *kc = kcpustat_this_cpu;
+
+ if (!vtime_generic_enabled_this_cpu()) {
+ WARN_ON_ONCE(!kc->idle_dyntick);
+ kcpustat_idle_stop(kc, now);
+ kc->idle_dyntick = false;
+ vtime_dyntick_stop();
+ }
+}
+
+void kcpustat_dyntick_start(u64 now)
+{
+ struct kernel_cpustat *kc = kcpustat_this_cpu;
+
+ if (!vtime_generic_enabled_this_cpu()) {
+ vtime_dyntick_start();
+ kc->idle_dyntick = true;
+ kcpustat_idle_start(kc, now);
+ }
+}
+
+void kcpustat_irq_enter(u64 now)
+{
+ struct kernel_cpustat *kc = kcpustat_this_cpu;
+
+ if (!vtime_generic_enabled_this_cpu() &&
+ (irqtime_enabled() || vtime_accounting_enabled_this_cpu()))
+ kcpustat_idle_stop(kc, now);
+}
+
+void kcpustat_irq_exit(u64 now)
+{
+ struct kernel_cpustat *kc = kcpustat_this_cpu;
+
+ /*
+ * Generic vtime already does its own idle accounting.
+ * But irqtime accounting or arch vtime which also accounts IRQs
+ * need to pause nohz accounting. Resume nohz accounting as long
+ * as the irqtime config is enabled to handle the case where irqtime
+ * accounting got runtime disabled in the middle of an IRQ.
+ */
+ if (!vtime_generic_enabled_this_cpu() &&
+ (IS_ENABLED(CONFIG_IRQ_TIME_ACCOUNTING) || vtime_accounting_enabled_this_cpu()))
+ kcpustat_idle_start(kc, now);
+}
+
+static u64 kcpustat_field_dyntick(int cpu, enum cpu_usage_stat idx,
+ bool compute_delta, u64 now)
+{
+ struct kernel_cpustat *kc = &kcpustat_cpu(cpu);
+ int iowait = idx == CPUTIME_IOWAIT;
+ u64 *cpustat = kc->cpustat;
+ unsigned int seq;
+ u64 idle;
+
+ do {
+ seq = read_seqcount_begin(&kc->idle_sleeptime_seq);
+
+ idle = cpustat[idx];
+
+ if (kc->idle_elapse && compute_delta && now > kc->idle_entrytime) {
+ u64 delta = now - kc->idle_entrytime;
+
+ delta -= min_t(u64, kc->idle_stealtime[iowait], delta);
+ idle += delta;
+ }
+ } while (read_seqcount_retry(&kc->idle_sleeptime_seq, seq));
+
+ return idle;
+}
+
+u64 kcpustat_field_idle(int cpu)
+{
+ return kcpustat_field_dyntick(cpu, CPUTIME_IDLE,
+ !nr_iowait_cpu(cpu), ktime_get());
+}
+EXPORT_SYMBOL_GPL(kcpustat_field_idle);
+
+u64 kcpustat_field_iowait(int cpu)
+{
+ return kcpustat_field_dyntick(cpu, CPUTIME_IOWAIT,
+ nr_iowait_cpu(cpu), ktime_get());
+}
+EXPORT_SYMBOL_GPL(kcpustat_field_iowait);
+#else
+static u64 kcpustat_field_dyntick(int cpu, enum cpu_usage_stat idx,
+ bool compute_delta, ktime_t now)
+{
+ return kcpustat_cpu(cpu).cpustat[idx];
+}
+#endif /* CONFIG_NO_HZ_COMMON */
+
+static u64 get_cpu_sleep_time_us(int cpu, enum cpu_usage_stat idx,
+ bool compute_delta, u64 *last_update_time)
+{
+ ktime_t now = ktime_get();
+ u64 res;
+
+ if (vtime_generic_enabled_cpu(cpu))
+ res = kcpustat_field(idx, cpu);
+ else
+ res = kcpustat_field_dyntick(cpu, idx, compute_delta, now);
+
+ do_div(res, NSEC_PER_USEC);
+
+ if (last_update_time)
+ *last_update_time = ktime_to_us(now);
+
+ return res;
+}
+
+/**
+ * get_cpu_idle_time_us - get the total idle time of a CPU
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in. Do not update
+ * counters if NULL.
+ *
+ * Return the cumulative idle time (since boot) for a given
+ * CPU, in microseconds. Note that this is partially broken due to
+ * the counter of iowait tasks that can be remotely updated without
+ * any synchronization. Therefore it is possible to observe backward
+ * values within two consecutive reads.
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * Return: total idle time of the @cpu
+ */
+u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
+{
+ return get_cpu_sleep_time_us(cpu, CPUTIME_IDLE,
+ !nr_iowait_cpu(cpu), last_update_time);
+}
+EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
+
+/**
+ * get_cpu_iowait_time_us - get the total iowait time of a CPU
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in. Do not update
+ * counters if NULL.
+ *
+ * Return the cumulative iowait time (since boot) for a given
+ * CPU, in microseconds. Note this is partially broken due to
+ * the counter of iowait tasks that can be remotely updated without
+ * any synchronization. Therefore it is possible to observe backward
+ * values within two consecutive reads.
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * Return: total iowait time of @cpu
+ */
+u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
+{
+ return get_cpu_sleep_time_us(cpu, CPUTIME_IOWAIT,
+ nr_iowait_cpu(cpu), last_update_time);
+}
+EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
+
/*
* Use precise platform statistics if available:
*/
@@ -437,11 +642,15 @@ void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
vtime_account_hardirq(tsk);
} else if (pc & SOFTIRQ_OFFSET) {
vtime_account_softirq(tsk);
- } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
- is_idle_task(tsk)) {
- vtime_account_idle(tsk);
+ } else if (!kcpustat_idle_dyntick()) {
+ if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
+ is_idle_task(tsk)) {
+ vtime_account_idle(tsk);
+ } else {
+ vtime_account_kernel(tsk);
+ }
} else {
- vtime_account_kernel(tsk);
+ vtime_reset();
}
}
@@ -483,6 +692,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
if (vtime_accounting_enabled_this_cpu())
return;
+ if (kcpustat_idle_dyntick())
+ return;
+
if (irqtime_enabled()) {
irqtime_account_process_tick(p, user_tick, 1);
return;
@@ -505,29 +717,6 @@ void account_process_tick(struct task_struct *p, int user_tick)
}
/*
- * Account multiple ticks of idle time.
- * @ticks: number of stolen ticks
- */
-void account_idle_ticks(unsigned long ticks)
-{
- u64 cputime, steal;
-
- if (irqtime_enabled()) {
- irqtime_account_idle_ticks(ticks);
- return;
- }
-
- cputime = ticks * TICK_NSEC;
- steal = steal_account_process_time(ULONG_MAX);
-
- if (steal >= cputime)
- return;
-
- cputime -= steal;
- account_idle_time(cputime);
-}
-
-/*
* Adjust tick based cputime random precision against scheduler runtime
* accounting.
*
@@ -587,12 +776,6 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
}
stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
- /*
- * Because mul_u64_u64_div_u64() can approximate on some
- * achitectures; enforce the constraint that: a*b/(b+c) <= a.
- */
- if (unlikely(stime > rtime))
- stime = rtime;
update:
/*
@@ -773,9 +956,9 @@ void vtime_guest_exit(struct task_struct *tsk)
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
-void vtime_account_idle(struct task_struct *tsk)
+static void __vtime_account_idle(struct vtime *vtime)
{
- account_idle_time(get_vtime_delta(&tsk->vtime));
+ account_idle_time(get_vtime_delta(vtime));
}
void vtime_task_switch_generic(struct task_struct *prev)
@@ -784,7 +967,7 @@ void vtime_task_switch_generic(struct task_struct *prev)
write_seqcount_begin(&vtime->seqcount);
if (vtime->state == VTIME_IDLE)
- vtime_account_idle(prev);
+ __vtime_account_idle(vtime);
else
__vtime_account_kernel(prev, vtime);
vtime->state = VTIME_INACTIVE;
@@ -926,6 +1109,7 @@ static int kcpustat_field_vtime(u64 *cpustat,
int cpu, u64 *val)
{
struct vtime *vtime = &tsk->vtime;
+ struct rq *rq = cpu_rq(cpu);
unsigned int seq;
do {
@@ -967,6 +1151,14 @@ static int kcpustat_field_vtime(u64 *cpustat,
if (state == VTIME_GUEST && task_nice(tsk) > 0)
*val += vtime->gtime + vtime_delta(vtime);
break;
+ case CPUTIME_IDLE:
+ if (state == VTIME_IDLE && !atomic_read(&rq->nr_iowait))
+ *val += vtime_delta(vtime);
+ break;
+ case CPUTIME_IOWAIT:
+ if (state == VTIME_IDLE && atomic_read(&rq->nr_iowait) > 0)
+ *val += vtime_delta(vtime);
+ break;
default:
break;
}
@@ -975,16 +1167,15 @@ static int kcpustat_field_vtime(u64 *cpustat,
return 0;
}
-u64 kcpustat_field(struct kernel_cpustat *kcpustat,
- enum cpu_usage_stat usage, int cpu)
+u64 kcpustat_field(enum cpu_usage_stat usage, int cpu)
{
- u64 *cpustat = kcpustat->cpustat;
+ u64 *cpustat = kcpustat_cpu(cpu).cpustat;
u64 val = cpustat[usage];
struct rq *rq;
int err;
- if (!vtime_accounting_enabled_cpu(cpu))
- return val;
+ if (!vtime_generic_enabled_cpu(cpu))
+ return kcpustat_field_default(usage, cpu);
rq = cpu_rq(cpu);
@@ -1030,8 +1221,8 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
*dst = *src;
cpustat = dst->cpustat;
- /* Task is sleeping, dead or idle, nothing to add */
- if (state < VTIME_SYS)
+ /* Task is sleeping or dead, nothing to add */
+ if (state < VTIME_IDLE)
continue;
delta = vtime_delta(vtime);
@@ -1040,15 +1231,17 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
* Task runs either in user (including guest) or kernel space,
* add pending nohz time to the right place.
*/
- if (state == VTIME_SYS) {
+ switch (state) {
+ case VTIME_SYS:
cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
- } else if (state == VTIME_USER) {
+ break;
+ case VTIME_USER:
if (task_nice(tsk) > 0)
cpustat[CPUTIME_NICE] += vtime->utime + delta;
else
cpustat[CPUTIME_USER] += vtime->utime + delta;
- } else {
- WARN_ON_ONCE(state != VTIME_GUEST);
+ break;
+ case VTIME_GUEST:
if (task_nice(tsk) > 0) {
cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
cpustat[CPUTIME_NICE] += vtime->gtime + delta;
@@ -1056,6 +1249,15 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
cpustat[CPUTIME_USER] += vtime->gtime + delta;
}
+ break;
+ case VTIME_IDLE:
+ if (atomic_read(&cpu_rq(cpu)->nr_iowait) > 0)
+ cpustat[CPUTIME_IOWAIT] += delta;
+ else
+ cpustat[CPUTIME_IDLE] += delta;
+ break;
+ default:
+ WARN_ON_ONCE(1);
}
} while (read_seqcount_retry(&vtime->seqcount, seq));
@@ -1068,8 +1270,8 @@ void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
struct rq *rq;
int err;
- if (!vtime_accounting_enabled_cpu(cpu)) {
- *dst = *src;
+ if (!vtime_generic_enabled_cpu(cpu)) {
+ kcpustat_cpu_fetch_default(dst, cpu);
return;
}
@@ -1082,7 +1284,7 @@ void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
curr = rcu_dereference(rq->curr);
if (WARN_ON_ONCE(!curr)) {
rcu_read_unlock();
- *dst = *src;
+ kcpustat_cpu_fetch_default(dst, cpu);
return;
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index edca7849b165..0f858b98c9aa 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1515,8 +1515,12 @@ throttle:
if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) {
if (dl_server(dl_se)) {
- replenish_dl_new_period(dl_se, rq);
- start_dl_timer(dl_se);
+ if (dl_se->dl_defer) {
+ replenish_dl_new_period(dl_se, rq);
+ start_dl_timer(dl_se);
+ } else {
+ enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
+ }
} else {
enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH);
}
@@ -1793,7 +1797,8 @@ void dl_server_start(struct sched_dl_entity *dl_se)
struct rq *rq = dl_se->rq;
dl_se->dl_defer_idle = 0;
- if (!dl_server(dl_se) || dl_se->dl_server_active || !dl_se->dl_runtime)
+ if (!dl_server(dl_se) || dl_se->dl_server_active || !dl_se->dl_runtime ||
+ !dl_se->dl_bw_attached)
return;
/*
@@ -1868,6 +1873,13 @@ void sched_init_dl_servers(void)
dl_se->dl_server = 1;
dl_se->dl_defer = 1;
setup_new_dl_entity(dl_se);
+
+ /*
+ * No BPF scheduler is loaded at boot, so the ext_server has no
+ * tasks to protect. Detach its bandwidth reservation; it will
+ * be attached when a BPF scheduler is loaded.
+ */
+ dl_server_detach_bw(dl_se);
#endif
}
}
@@ -1878,6 +1890,9 @@ void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq)
int cpu = cpu_of(rq);
struct dl_bw *dl_b;
+ if (!dl_se->dl_bw_attached)
+ return;
+
dl_b = dl_bw_of(cpu_of(rq));
guard(raw_spinlock)(&dl_b->lock);
@@ -1889,7 +1904,8 @@ void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq)
int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init)
{
- u64 old_bw = init ? 0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+ u64 old_bw = (init || !dl_se->dl_bw_attached) ? 0 :
+ to_ratio(dl_se->dl_period, dl_se->dl_runtime);
u64 new_bw = to_ratio(period, runtime);
struct rq *rq = dl_se->rq;
int cpu = cpu_of(rq);
@@ -1909,7 +1925,8 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio
if (init) {
__add_rq_bw(new_bw, &rq->dl);
__dl_add(dl_b, new_bw, cpus);
- } else {
+ dl_se->dl_bw_attached = 1;
+ } else if (dl_se->dl_bw_attached) {
__dl_sub(dl_b, dl_se->dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
@@ -1930,6 +1947,181 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio
}
/*
+ * Add @dl_se's bw to the root-domain accounting.
+ *
+ * Return -EBUSY if attaching would overflow root domain capacity.
+ */
+static int __dl_server_attach_bw_locked(struct sched_dl_entity *dl_se,
+ struct dl_bw *dl_b, int cpus)
+{
+ struct rq *rq = dl_se->rq;
+ unsigned long cap;
+
+ /*
+ * Always update @rq->dl.this_bw, but only update @dl_b->total_bw
+ * (and run the overflow check it gates) while this CPU is active.
+ *
+ * This mirrors dl_server_add_bw() during root-domain rebuilds, which
+ * only publishes bandwidth from active CPUs into @dl_b.
+ */
+ if (cpu_active(cpu_of(rq))) {
+ cap = dl_bw_capacity(cpu_of(rq));
+ if (__dl_overflow(dl_b, cap, 0, dl_se->dl_bw))
+ return -EBUSY;
+ __dl_add(dl_b, dl_se->dl_bw, cpus);
+ }
+ __add_rq_bw(dl_se->dl_bw, &rq->dl);
+ dl_se->dl_bw_attached = 1;
+
+ return 0;
+}
+
+/*
+ * Drain @dl_se and remove its bw from the root-domain accounting.
+ */
+static void __dl_server_detach_bw_locked(struct sched_dl_entity *dl_se,
+ struct dl_bw *dl_b, int cpus)
+{
+ struct rq *rq = dl_se->rq;
+
+ /*
+ * If the server is still active (on_rq), dequeue it via
+ * dl_server_stop(); task_non_contending() will either subtract
+ * @dl_bw from running_bw immediately (0-lag passed) or set
+ * dl_non_contending and arm the inactive_timer.
+ */
+ if (dl_se->dl_server_active)
+ dl_server_stop(dl_se);
+
+ /*
+ * Drop @dl_se's contribution from this rq's bandwidth accounting,
+ * mirroring the __add_rq_bw() done at attach time.
+ */
+ dl_rq_change_utilization(rq, dl_se, 0);
+
+ /*
+ * Update @dl_b only while this CPU is active, matching
+ * dl_server_add_bw() during root-domain rebuilds.
+ *
+ * If this CPU is inactive, its bandwidth is not currently accounted in
+ * @dl_b->total_bw: either attach skipped adding it, or a rebuild
+ * already dropped it while re-publishing active CPUs only.
+ *
+ * In that case there is nothing to subtract from @dl_b. Just clear
+ * @dl_se->dl_bw_attached; if the CPU becomes active again, the next
+ * rebuild will re-publish its bandwidth.
+ */
+ if (cpu_active(cpu_of(rq)))
+ __dl_sub(dl_b, dl_se->dl_bw, cpus);
+ dl_se->dl_bw_attached = 0;
+}
+
+/*
+ * Attach @dl_se's bandwidth to the root domain's total_bw accounting.
+ *
+ * Use to dynamically register a dl_server's bandwidth reservation while
+ * preserving its configured @dl_runtime / @dl_period. No-op if @dl_se is
+ * already attached.
+ *
+ * Returns -EBUSY if attaching would overflow the root domain capacity.
+ */
+int dl_server_attach_bw(struct sched_dl_entity *dl_se)
+{
+ struct rq *rq = dl_se->rq;
+ int cpu = cpu_of(rq);
+ struct dl_bw *dl_b;
+ int cpus, ret;
+
+ if (dl_se->dl_bw_attached)
+ return 0;
+
+ scoped_guard (raw_spinlock, &dl_bw_of(cpu)->lock) {
+ dl_b = dl_bw_of(cpu);
+ cpus = dl_bw_cpus(cpu);
+ ret = __dl_server_attach_bw_locked(dl_se, dl_b, cpus);
+ }
+ if (ret)
+ return ret;
+
+ /*
+ * The natural 0->nr_running transition that triggers dl_server_start()
+ * may have happened while @dl_se was still detached (e.g., between
+ * scx_bypass(false) and the scx_enable() re-balance loop), so kick a
+ * start here.
+ *
+ * dl_server_start() bails out cleanly if there's nothing to schedule or
+ * it's already active. Skip if @cpu is offline; the server will be
+ * started naturally on the first enqueue once @cpu comes back.
+ */
+ if (cpu_online(cpu))
+ dl_server_start(dl_se);
+
+ return 0;
+}
+
+/*
+ * Detach @dl_se's bandwidth from the root domain's total_bw accounting.
+ *
+ * Use to dynamically unregister a dl_server's bandwidth reservation while
+ * preserving its configured @dl_runtime / @dl_period. No-op if @dl_se is
+ * not currently attached.
+ */
+void dl_server_detach_bw(struct sched_dl_entity *dl_se)
+{
+ int cpu = cpu_of(dl_se->rq);
+ struct dl_bw *dl_b;
+ int cpus;
+
+ if (!dl_se->dl_bw_attached)
+ return;
+
+ dl_b = dl_bw_of(cpu);
+ guard(raw_spinlock)(&dl_b->lock);
+ cpus = dl_bw_cpus(cpu);
+ __dl_server_detach_bw_locked(dl_se, dl_b, cpus);
+}
+
+/*
+ * Atomically detach @detach_se and attach @attach_se on the same rq, holding
+ * @dl_b->lock across both operations so a concurrent sched_setattr() cannot
+ * steal the bandwidth freed by the detach before the attach can claim it.
+ *
+ * Both entities must live on the same rq (same root domain). Returns the
+ * result of the attach: -EBUSY if attaching @attach_se would overflow root
+ * domain capacity (in which case both servers end up detached).
+ */
+int dl_server_swap_bw(struct sched_dl_entity *detach_se,
+ struct sched_dl_entity *attach_se)
+{
+ struct rq *rq = detach_se->rq;
+ int cpu = cpu_of(rq);
+ struct dl_bw *dl_b;
+ int cpus, ret;
+
+ WARN_ON_ONCE(attach_se->rq != rq);
+
+ scoped_guard (raw_spinlock, &dl_bw_of(cpu)->lock) {
+ dl_b = dl_bw_of(cpu);
+ cpus = dl_bw_cpus(cpu);
+
+ if (detach_se->dl_bw_attached)
+ __dl_server_detach_bw_locked(detach_se, dl_b, cpus);
+
+ if (attach_se->dl_bw_attached)
+ ret = 0;
+ else
+ ret = __dl_server_attach_bw_locked(attach_se, dl_b, cpus);
+ }
+ if (ret)
+ return ret;
+
+ if (cpu_online(cpu))
+ dl_server_start(attach_se);
+
+ return 0;
+}
+
+/*
* Update the current task's runtime statistics (provided it is still
* a -deadline task and has not been removed from the dl_rq).
*/
@@ -2292,7 +2484,10 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags)
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
- if (is_dl_boosted(&p->dl)) {
+ struct sched_dl_entity *dl_se = &p->dl;
+ struct dl_rq *dl_rq = &rq->dl;
+
+ if (is_dl_boosted(dl_se)) {
/*
* Because of delays in the detection of the overrun of a
* thread's runtime, it might be the case that a thread
@@ -2305,14 +2500,14 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
*
* In this case, the boost overrides the throttle.
*/
- if (p->dl.dl_throttled) {
+ if (dl_se->dl_throttled) {
/*
* The replenish timer needs to be canceled. No
* problem if it fires concurrently: boosted threads
* are ignored in dl_task_timer().
*/
- cancel_replenish_timer(&p->dl);
- p->dl.dl_throttled = 0;
+ cancel_replenish_timer(dl_se);
+ dl_se->dl_throttled = 0;
}
} else if (!dl_prio(p->normal_prio)) {
/*
@@ -2324,7 +2519,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* being boosted again with no means to replenish the runtime and clear
* the throttle.
*/
- p->dl.dl_throttled = 0;
+ dl_se->dl_throttled = 0;
if (!(flags & ENQUEUE_REPLENISH))
printk_deferred_once("sched: DL de-boosted task PID %d: REPLENISH flag missing\n",
task_pid_nr(p));
@@ -2333,20 +2528,23 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
}
check_schedstat_required();
- update_stats_wait_start_dl(dl_rq_of_se(&p->dl), &p->dl);
+ update_stats_wait_start_dl(dl_rq, dl_se);
- if (p->on_rq == TASK_ON_RQ_MIGRATING)
+ if (task_on_rq_migrating(p))
flags |= ENQUEUE_MIGRATING;
- enqueue_dl_entity(&p->dl, flags);
+ enqueue_dl_entity(dl_se, flags);
- if (dl_server(&p->dl))
+ if (dl_server(dl_se))
return;
if (task_is_blocked(p))
return;
- if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1)
+ if (dl_rq->curr == dl_se)
+ return;
+
+ if (!task_current(rq, p) && !dl_se->dl_throttled && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
@@ -2354,7 +2552,7 @@ static bool dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
update_curr_dl(rq);
- if (p->on_rq == TASK_ON_RQ_MIGRATING)
+ if (task_on_rq_migrating(p))
flags |= DEQUEUE_MIGRATING;
dequeue_dl_entity(&p->dl, flags);
@@ -2506,8 +2704,14 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
resched_curr(rq);
}
-static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
+static int balance_dl(struct rq *rq, struct rq_flags *rf)
{
+ /*
+ * Note: rq->donor may change during rq lock drops,
+ * so don't re-use @p (sampled here) across lock drops.
+ */
+ struct task_struct *p = rq->donor;
+
if (!on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) {
/*
* This is OK, because current is on_cpu, which avoids it being
@@ -2562,6 +2766,10 @@ static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
}
#endif /* !CONFIG_SCHED_HRTICK */
+/*
+ * DL keeps current in tree, because ->deadline is not typically changed while
+ * a task is runnable.
+ */
static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
{
struct sched_dl_entity *dl_se = &p->dl;
@@ -2574,6 +2782,9 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
/* You can't push away the running task */
dequeue_pushable_dl_task(rq, p);
+ WARN_ON_ONCE(dl_rq->curr);
+ dl_rq->curr = dl_se;
+
if (!first)
return;
@@ -2637,17 +2848,20 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_s
struct sched_dl_entity *dl_se = &p->dl;
struct dl_rq *dl_rq = &rq->dl;
- if (on_dl_rq(&p->dl))
+ if (on_dl_rq(dl_se))
update_stats_wait_start_dl(dl_rq, dl_se);
update_curr_dl(rq);
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
+ WARN_ON_ONCE(dl_rq->curr != dl_se);
+ dl_rq->curr = NULL;
+
if (task_is_blocked(p))
return;
- if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
+ if (on_dl_rq(dl_se) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
@@ -3107,20 +3321,18 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
static void set_cpus_allowed_dl(struct task_struct *p,
struct affinity_context *ctx)
{
- struct root_domain *src_rd;
struct rq *rq;
WARN_ON_ONCE(!dl_task(p));
rq = task_rq(p);
- src_rd = rq->rd;
/*
* Migrating a SCHED_DEADLINE task between exclusive
* cpusets (different root_domains) entails a bandwidth
* update. We already made space for us in the destination
* domain (see cpuset_can_attach()).
*/
- if (!cpumask_intersects(src_rd->span, ctx->new_mask)) {
+ if (dl_task_needs_bw_move(p, ctx->new_mask)) {
struct dl_bw *src_dl_b;
src_dl_b = dl_bw_of(cpu_of(rq));
@@ -3137,6 +3349,15 @@ static void set_cpus_allowed_dl(struct task_struct *p,
set_cpus_allowed_common(p, ctx);
}
+bool dl_task_needs_bw_move(struct task_struct *p,
+ const struct cpumask *new_mask)
+{
+ if (!dl_task(p))
+ return false;
+
+ return !cpumask_intersects(task_rq(p)->rd->span, new_mask);
+}
+
/* Assumes rq->lock is held */
static void rq_online_dl(struct rq *rq)
{
@@ -3229,12 +3450,12 @@ static void dl_server_add_bw(struct root_domain *rd, int cpu)
struct sched_dl_entity *dl_se;
dl_se = &cpu_rq(cpu)->fair_server;
- if (dl_server(dl_se) && cpu_active(cpu))
+ if (dl_server(dl_se) && dl_se->dl_bw_attached && cpu_active(cpu))
__dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu));
#ifdef CONFIG_SCHED_CLASS_EXT
dl_se = &cpu_rq(cpu)->ext_server;
- if (dl_server(dl_se) && cpu_active(cpu))
+ if (dl_server(dl_se) && dl_se->dl_bw_attached && cpu_active(cpu))
__dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu));
#endif
}
@@ -3243,11 +3464,13 @@ static u64 dl_server_read_bw(int cpu)
{
u64 dl_bw = 0;
- if (cpu_rq(cpu)->fair_server.dl_server)
+ if (cpu_rq(cpu)->fair_server.dl_server &&
+ cpu_rq(cpu)->fair_server.dl_bw_attached)
dl_bw += cpu_rq(cpu)->fair_server.dl_bw;
#ifdef CONFIG_SCHED_CLASS_EXT
- if (cpu_rq(cpu)->ext_server.dl_server)
+ if (cpu_rq(cpu)->ext_server.dl_server &&
+ cpu_rq(cpu)->ext_server.dl_bw_attached)
dl_bw += cpu_rq(cpu)->ext_server.dl_bw;
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 74c1617cf652..40584b27ea0c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -136,7 +136,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
if (cnt > 63)
cnt = 63;
- if (copy_from_user(&buf, ubuf, cnt))
+ if (copy_from_user(buf, ubuf, cnt))
return -EFAULT;
buf[cnt] = 0;
@@ -210,6 +210,48 @@ static const struct file_operations sched_scaling_fops = {
.release = single_release,
};
+#ifdef CONFIG_SCHED_CACHE
+static ssize_t
+sched_cache_enable_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ bool val;
+ int ret;
+
+ ret = kstrtobool_from_user(ubuf, cnt, &val);
+ if (ret)
+ return ret;
+
+ sysctl_sched_cache_user = val;
+
+ sched_cache_active_set();
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static int sched_cache_enable_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "%d\n", sysctl_sched_cache_user);
+ return 0;
+}
+
+static int sched_cache_enable_open(struct inode *inode,
+ struct file *filp)
+{
+ return single_open(filp, sched_cache_enable_show, NULL);
+}
+
+static const struct file_operations sched_cache_enable_fops = {
+ .open = sched_cache_enable_open,
+ .write = sched_cache_enable_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif
+
#ifdef CONFIG_PREEMPT_DYNAMIC
static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
@@ -221,7 +263,7 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
if (cnt > 15)
cnt = 15;
- if (copy_from_user(&buf, ubuf, cnt))
+ if (copy_from_user(buf, ubuf, cnt))
return -EFAULT;
buf[cnt] = 0;
@@ -239,6 +281,7 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
static int sched_dynamic_show(struct seq_file *m, void *v)
{
int i = (IS_ENABLED(CONFIG_PREEMPT_RT) || IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY)) * 2;
+ int mode = READ_ONCE(preempt_dynamic_mode);
int j;
/* Count entries in NULL terminated preempt_modes */
@@ -247,10 +290,10 @@ static int sched_dynamic_show(struct seq_file *m, void *v)
j -= !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
for (; i < j; i++) {
- if (preempt_dynamic_mode == i)
+ if (mode == i)
seq_puts(m, "(");
seq_puts(m, preempt_modes[i]);
- if (preempt_dynamic_mode == i)
+ if (mode == i)
seq_puts(m, ")");
seq_puts(m, " ");
@@ -373,6 +416,9 @@ static ssize_t sched_server_write_common(struct file *filp, const char __user *u
return -EINVAL;
}
+ if (!cpu_online(cpu_of(rq)))
+ return -EBUSY;
+
update_rq_clock(rq);
dl_server_stop(dl_se);
retval = dl_server_apply_params(dl_se, runtime, period, 0);
@@ -445,6 +491,8 @@ static const struct file_operations fair_server_runtime_fops = {
.release = single_release,
};
+static struct dentry *debugfs_sched;
+
#ifdef CONFIG_SCHED_CLASS_EXT
static ssize_t
sched_ext_server_runtime_write(struct file *filp, const char __user *ubuf,
@@ -477,75 +525,92 @@ static const struct file_operations ext_server_runtime_fops = {
.llseek = seq_lseek,
.release = single_release,
};
-#endif /* CONFIG_SCHED_CLASS_EXT */
static ssize_t
-sched_fair_server_period_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+sched_ext_server_period_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
long cpu = (long) ((struct seq_file *) filp->private_data)->private;
struct rq *rq = cpu_rq(cpu);
return sched_server_write_common(filp, ubuf, cnt, ppos, DL_PERIOD,
- &rq->fair_server);
+ &rq->ext_server);
}
-static int sched_fair_server_period_show(struct seq_file *m, void *v)
+static int sched_ext_server_period_show(struct seq_file *m, void *v)
{
unsigned long cpu = (unsigned long) m->private;
struct rq *rq = cpu_rq(cpu);
- return sched_server_show_common(m, v, DL_PERIOD, &rq->fair_server);
+ return sched_server_show_common(m, v, DL_PERIOD, &rq->ext_server);
}
-static int sched_fair_server_period_open(struct inode *inode, struct file *filp)
+static int sched_ext_server_period_open(struct inode *inode, struct file *filp)
{
- return single_open(filp, sched_fair_server_period_show, inode->i_private);
+ return single_open(filp, sched_ext_server_period_show, inode->i_private);
}
-static const struct file_operations fair_server_period_fops = {
- .open = sched_fair_server_period_open,
- .write = sched_fair_server_period_write,
+static const struct file_operations ext_server_period_fops = {
+ .open = sched_ext_server_period_open,
+ .write = sched_ext_server_period_write,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
-#ifdef CONFIG_SCHED_CLASS_EXT
+static void debugfs_ext_server_init(void)
+{
+ struct dentry *d_ext;
+ unsigned long cpu;
+
+ d_ext = debugfs_create_dir("ext_server", debugfs_sched);
+ if (!d_ext)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct dentry *d_cpu;
+ char buf[32];
+
+ snprintf(buf, sizeof(buf), "cpu%lu", cpu);
+ d_cpu = debugfs_create_dir(buf, d_ext);
+
+ debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &ext_server_runtime_fops);
+ debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &ext_server_period_fops);
+ }
+}
+#endif /* CONFIG_SCHED_CLASS_EXT */
+
static ssize_t
-sched_ext_server_period_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+sched_fair_server_period_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
long cpu = (long) ((struct seq_file *) filp->private_data)->private;
struct rq *rq = cpu_rq(cpu);
return sched_server_write_common(filp, ubuf, cnt, ppos, DL_PERIOD,
- &rq->ext_server);
+ &rq->fair_server);
}
-static int sched_ext_server_period_show(struct seq_file *m, void *v)
+static int sched_fair_server_period_show(struct seq_file *m, void *v)
{
unsigned long cpu = (unsigned long) m->private;
struct rq *rq = cpu_rq(cpu);
- return sched_server_show_common(m, v, DL_PERIOD, &rq->ext_server);
+ return sched_server_show_common(m, v, DL_PERIOD, &rq->fair_server);
}
-static int sched_ext_server_period_open(struct inode *inode, struct file *filp)
+static int sched_fair_server_period_open(struct inode *inode, struct file *filp)
{
- return single_open(filp, sched_ext_server_period_show, inode->i_private);
+ return single_open(filp, sched_fair_server_period_show, inode->i_private);
}
-static const struct file_operations ext_server_period_fops = {
- .open = sched_ext_server_period_open,
- .write = sched_ext_server_period_write,
+static const struct file_operations fair_server_period_fops = {
+ .open = sched_fair_server_period_open,
+ .write = sched_fair_server_period_write,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
-#endif /* CONFIG_SCHED_CLASS_EXT */
-
-static struct dentry *debugfs_sched;
static void debugfs_fair_server_init(void)
{
@@ -568,32 +633,9 @@ static void debugfs_fair_server_init(void)
}
}
-#ifdef CONFIG_SCHED_CLASS_EXT
-static void debugfs_ext_server_init(void)
-{
- struct dentry *d_ext;
- unsigned long cpu;
-
- d_ext = debugfs_create_dir("ext_server", debugfs_sched);
- if (!d_ext)
- return;
-
- for_each_possible_cpu(cpu) {
- struct dentry *d_cpu;
- char buf[32];
-
- snprintf(buf, sizeof(buf), "cpu%lu", cpu);
- d_cpu = debugfs_create_dir(buf, d_ext);
-
- debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &ext_server_runtime_fops);
- debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &ext_server_period_fops);
- }
-}
-#endif /* CONFIG_SCHED_CLASS_EXT */
-
static __init int sched_init_debug(void)
{
- struct dentry __maybe_unused *numa;
+ struct dentry __maybe_unused *numa, *llc;
debugfs_sched = debugfs_create_dir("sched", NULL);
@@ -626,6 +668,22 @@ static __init int sched_init_debug(void)
debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_SCHED_CACHE
+ llc = debugfs_create_dir("llc_balancing", debugfs_sched);
+ debugfs_create_file("enabled", 0644, llc, NULL,
+ &sched_cache_enable_fops);
+ debugfs_create_u32("aggr_tolerance", 0644, llc,
+ &llc_aggr_tolerance);
+ debugfs_create_u32("epoch_period", 0644, llc,
+ &llc_epoch_period);
+ debugfs_create_u32("epoch_affinity_timeout", 0644, llc,
+ &llc_epoch_affinity_timeout);
+ debugfs_create_u32("overaggr_pct", 0644, llc,
+ &llc_overaggr_pct);
+ debugfs_create_u32("imb_pct", 0644, llc,
+ &llc_imb_pct);
+#endif
+
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
debugfs_fair_server_init();
@@ -750,7 +808,7 @@ void dirty_sched_domain_sysctl(int cpu)
#ifdef CONFIG_FAIR_GROUP_SCHED
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{
- struct sched_entity *se = tg->se[cpu];
+ struct sched_entity *se = tg_se(tg, cpu);
#define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", \
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 345aa11b84b2..0db6fa2daea3 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -6,8 +6,6 @@
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
*/
-#include <linux/btf_ids.h>
-#include "ext_idle.h"
static DEFINE_RAW_SPINLOCK(scx_sched_lock);
@@ -38,6 +36,15 @@ static const struct rhashtable_params scx_sched_hash_params = {
static struct rhashtable scx_sched_hash;
#endif
+/* see SCX_OPS_TID_TO_TASK */
+static const struct rhashtable_params scx_tid_hash_params = {
+ .key_len = sizeof_field(struct sched_ext_entity, tid),
+ .key_offset = offsetof(struct sched_ext_entity, tid),
+ .head_offset = offsetof(struct sched_ext_entity, tid_hash_node),
+ .insecure_elasticity = true, /* inserted/removed under scx_tasks_lock */
+};
+static struct rhashtable scx_tid_hash;
+
/*
* During exit, a task may schedule after losing its PIDs. When disabling the
* BPF scheduler, we need to be able to iterate tasks in every state to
@@ -56,10 +63,25 @@ static DEFINE_RAW_SPINLOCK(scx_bypass_lock);
static bool scx_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
+static DEFINE_STATIC_KEY_FALSE(__scx_tid_to_task_enabled);
+
+/*
+ * True once SCX_OPS_TID_TO_TASK has been negotiated with the root scheduler
+ * and the tid->task table is live. Wraps the static key so callers don't
+ * take the address, and hints "likely enabled" for the common case where
+ * the feature is in use.
+ */
+static inline bool scx_tid_to_task_enabled(void)
+{
+ return static_branch_likely(&__scx_tid_to_task_enabled);
+}
static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
+/* Global cursor for the per-CPU tid allocator. Starts at 1; tid 0 is reserved. */
+static atomic64_t scx_tid_cursor = ATOMIC64_INIT(1);
+
#ifdef CONFIG_EXT_SUB_SCHED
/*
* The sub sched being enabled. Used by scx_disable_and_exit_task() to exit
@@ -109,6 +131,17 @@ struct scx_kick_syncs {
static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs);
/*
+ * Per-CPU buffered allocator state for p->scx.tid. Each CPU pulls a chunk of
+ * SCX_TID_CHUNK ids from scx_tid_cursor and hands them out locally without
+ * further synchronization. See scx_alloc_tid().
+ */
+struct scx_tid_alloc {
+ u64 next;
+ u64 end;
+};
+static DEFINE_PER_CPU(struct scx_tid_alloc, scx_tid_alloc);
+
+/*
* Direct dispatch marker.
*
* Non-NULL values are used for direct dispatch from enqueue path. A valid
@@ -198,26 +231,21 @@ static void run_deferred(struct rq *rq);
static bool task_dead_and_done(struct task_struct *p);
static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind);
-static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
- s64 exit_code, const char *fmt, va_list args);
-static __printf(4, 5) bool scx_exit(struct scx_sched *sch,
- enum scx_exit_kind kind, s64 exit_code,
- const char *fmt, ...)
+__printf(5, 6) bool __scx_exit(struct scx_sched *sch,
+ enum scx_exit_kind kind, s64 exit_code,
+ s32 exit_cpu, const char *fmt, ...)
{
va_list args;
bool ret;
va_start(args, fmt);
- ret = scx_vexit(sch, kind, exit_code, fmt, args);
+ ret = scx_vexit(sch, kind, exit_code, exit_cpu, fmt, args);
va_end(args);
return ret;
}
-#define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args)
-#define scx_verror(sch, fmt, args) scx_vexit((sch), SCX_EXIT_ERROR, 0, fmt, args)
-
#define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op)
static long jiffies_delta_msecs(unsigned long at, unsigned long now)
@@ -295,10 +323,9 @@ static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch)
rcu_assign_pointer(p->scx.sched, sch);
}
#else /* CONFIG_EXT_SUB_SCHED */
-static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; }
-static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; }
-static struct scx_sched *scx_find_sub_sched(u64 cgroup_id) { return NULL; }
-static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {}
+static inline struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; }
+static inline struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; }
+static inline void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {}
#endif /* CONFIG_EXT_SUB_SCHED */
/**
@@ -485,6 +512,33 @@ do { \
update_locked_rq(__prev_locked_rq); \
} while (0)
+/*
+ * Flipped on enable per sch->is_cid_type. Declared in ext_internal.h so
+ * subsystem inlines can read it.
+ */
+DEFINE_STATIC_KEY_FALSE(__scx_is_cid_type);
+
+/*
+ * scx_cpu_arg() wraps a cpu arg being handed to an SCX op. For cid-form
+ * schedulers it resolves to the matching cid; for cpu-form it passes @cpu
+ * through. scx_cpu_ret() is the inverse for a cpu/cid returned from an op
+ * (currently only ops.select_cpu); it validates the BPF-supplied cid and
+ * triggers scx_error() on @sch if invalid.
+ */
+static s32 scx_cpu_arg(s32 cpu)
+{
+ if (scx_is_cid_type())
+ return __scx_cpu_to_cid(cpu);
+ return cpu;
+}
+
+static s32 scx_cpu_ret(struct scx_sched *sch, s32 cpu_or_cid)
+{
+ if (cpu_or_cid < 0 || !scx_is_cid_type())
+ return cpu_or_cid;
+ return scx_cid_to_cpu(sch, cpu_or_cid);
+}
+
#define SCX_CALL_OP_RET(sch, op, locked_rq, args...) \
({ \
struct rq *__prev_locked_rq; \
@@ -546,6 +600,44 @@ do { \
__ret; \
})
+/**
+ * scx_call_op_set_cpumask - invoke ops.set_cpumask / ops_cid.set_cmask for @task
+ * @sch: scx_sched being invoked
+ * @rq: rq to update as the currently-locked rq, or NULL
+ * @task: task whose affinity is changing
+ * @cpumask: new cpumask
+ *
+ * For cid-form schedulers, translate @cpumask to a cmask via the per-cpu
+ * scratch in ext_cid.c and dispatch through the ops_cid union view. Caller
+ * must hold @rq's rq lock so this_cpu_ptr is stable across the call.
+ */
+static inline void scx_call_op_set_cpumask(struct scx_sched *sch, struct rq *rq,
+ struct task_struct *task,
+ const struct cpumask *cpumask)
+{
+ WARN_ON_ONCE(current->scx.kf_tasks[0]);
+ current->scx.kf_tasks[0] = task;
+ if (rq)
+ update_locked_rq(rq);
+
+ if (scx_is_cid_type()) {
+ struct scx_cmask *kern_va = *this_cpu_ptr(sch->set_cmask_scratch);
+ /*
+ * Build the per-CPU arena cmask and hand BPF its arena address.
+ * Caller holds the rq lock with IRQs disabled, which makes us
+ * the sole user of the scratch area.
+ */
+ scx_cpumask_to_cmask(cpumask, kern_va);
+ sch->ops_cid.set_cmask(task, scx_kaddr_to_arena(sch, kern_va));
+ } else {
+ sch->ops.set_cpumask(task, cpumask);
+ }
+
+ if (rq)
+ update_locked_rq(NULL);
+ current->scx.kf_tasks[0] = NULL;
+}
+
/* see SCX_CALL_OP_TASK() */
static __always_inline bool scx_kf_arg_task_ok(struct scx_sched *sch,
struct task_struct *p)
@@ -712,6 +804,51 @@ struct bpf_iter_scx_dsq {
} __attribute__((aligned(8)));
+static u32 scx_get_task_state(const struct task_struct *p)
+{
+ return p->scx.flags & SCX_TASK_STATE_MASK;
+}
+
+static void scx_set_task_state(struct task_struct *p, u32 state)
+{
+ u32 prev_state = scx_get_task_state(p);
+ bool warn = false;
+
+ switch (state) {
+ case SCX_TASK_NONE:
+ warn = prev_state == SCX_TASK_DEAD;
+ break;
+ case SCX_TASK_INIT_BEGIN:
+ warn = prev_state != SCX_TASK_NONE;
+ break;
+ case SCX_TASK_INIT:
+ warn = prev_state != SCX_TASK_INIT_BEGIN;
+ p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
+ break;
+ case SCX_TASK_READY:
+ warn = !(prev_state == SCX_TASK_INIT ||
+ prev_state == SCX_TASK_ENABLED);
+ break;
+ case SCX_TASK_ENABLED:
+ warn = prev_state != SCX_TASK_READY;
+ break;
+ case SCX_TASK_DEAD:
+ warn = !(prev_state == SCX_TASK_NONE ||
+ prev_state == SCX_TASK_INIT_BEGIN);
+ break;
+ default:
+ WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]",
+ prev_state, state, p->comm, p->pid);
+ return;
+ }
+
+ WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]",
+ prev_state, state, p->comm, p->pid);
+
+ p->scx.flags &= ~SCX_TASK_STATE_MASK;
+ p->scx.flags |= state;
+}
+
/*
* SCX task iterator.
*/
@@ -766,7 +903,8 @@ static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp)
lockdep_assert_held(&cgroup_mutex);
iter->cgrp = cgrp;
iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self);
- css_task_iter_start(iter->css_pos, 0, &iter->css_iter);
+ css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD,
+ &iter->css_iter);
return;
}
#endif
@@ -813,6 +951,24 @@ static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter)
}
/**
+ * scx_task_iter_relock - Re-acquire scx_tasks_lock and, optionally, @p's rq
+ * @iter: iterator to relock
+ * @p: task whose rq to lock, or %NULL for scx_tasks_lock only
+ *
+ * Counterpart to scx_task_iter_unlock(). Locking @p's rq is optional. Once
+ * re-acquired, both locks are managed by the iterator from here on.
+ */
+static void scx_task_iter_relock(struct scx_task_iter *iter,
+ struct task_struct *p)
+{
+ __scx_task_iter_maybe_relock(iter);
+ if (p) {
+ iter->rq = task_rq_lock(p, &iter->rf);
+ iter->locked_task = p;
+ }
+}
+
+/**
* scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock
* @iter: iterator to exit
*
@@ -866,7 +1022,8 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
iter->css_pos = css_next_descendant_pre(iter->css_pos,
&iter->cgrp->self);
if (iter->css_pos)
- css_task_iter_start(iter->css_pos, 0, &iter->css_iter);
+ css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD,
+ &iter->css_iter);
}
return NULL;
}
@@ -926,16 +1083,27 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
*
* Test for idle_sched_class as only init_tasks are on it.
*/
- if (p->sched_class != &idle_sched_class)
- break;
- }
- if (!p)
- return NULL;
+ if (p->sched_class == &idle_sched_class)
+ continue;
- iter->rq = task_rq_lock(p, &iter->rf);
- iter->locked_task = p;
+ iter->rq = task_rq_lock(p, &iter->rf);
+ iter->locked_task = p;
- return p;
+ /*
+ * cgroup_task_dead() removes the dead tasks from cset->tasks
+ * after sched_ext_dead() and cgroup iteration may see tasks
+ * which already finished sched_ext_dead(). %SCX_TASK_DEAD is
+ * set by sched_ext_dead() under @p's rq lock. Test it to
+ * avoid visiting tasks which are already dead from SCX POV.
+ */
+ if (scx_get_task_state(p) == SCX_TASK_DEAD) {
+ __scx_task_iter_rq_unlock(iter);
+ continue;
+ }
+
+ return p;
+ }
+ return NULL;
}
/**
@@ -1029,7 +1197,7 @@ static inline bool __cpu_valid(s32 cpu)
}
/**
- * ops_cpu_valid - Verify a cpu number, to be used on ops input args
+ * scx_cpu_valid - Verify a cpu number, to be used on ops input args
* @sch: scx_sched to abort on error
* @cpu: cpu number which came from a BPF ops
* @where: extra information reported on error
@@ -1038,7 +1206,7 @@ static inline bool __cpu_valid(s32 cpu)
* Verify that it is in range and one of the possible cpus. If invalid, trigger
* an ops error.
*/
-static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where)
+bool scx_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where)
{
if (__cpu_valid(cpu)) {
return true;
@@ -1685,9 +1853,9 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
return &rq->scx.local_dsq;
if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
- s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
+ s32 cpu = scx_cpu_ret(sch, dsq_id & SCX_DSQ_LOCAL_CPU_MASK);
- if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
+ if (!scx_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
return find_global_dsq(sch, tcpu);
return &cpu_rq(cpu)->scx.local_dsq;
@@ -2021,6 +2189,7 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
/* dequeue is always temporary, don't reset runnable_at */
clr_task_runnable(p, false);
+retry:
/* acquire ensures that we see the preceding updates on QUEUED */
opss = atomic_long_read_acquire(&p->scx.ops_state);
@@ -2034,8 +2203,20 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
*/
BUG();
case SCX_OPSS_QUEUED:
- /* A queued task must always be in BPF scheduler's custody */
- WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_IN_CUSTODY));
+ /*
+ * A queued task must always be in BPF scheduler's custody. If
+ * SCX_TASK_IN_CUSTODY is clear, finish_dispatch() on another
+ * CPU has already passed call_task_dequeue() (which clears the
+ * flag), but has not yet written SCX_OPSS_NONE. That final
+ * store does not require this rq's lock, so retrying with
+ * cpu_relax() is bounded: we will observe NONE (or DISPATCHING,
+ * handled by the fallthrough) on a subsequent iteration.
+ */
+ if (unlikely(!(READ_ONCE(p->scx.flags) & SCX_TASK_IN_CUSTODY))) {
+ cpu_relax();
+ goto retry;
+ }
+
if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
SCX_OPSS_NONE))
break;
@@ -2767,11 +2948,13 @@ scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
dspc->nr_tasks = 0;
if (nested) {
- SCX_CALL_OP(sch, dispatch, rq, cpu, prev_on_sch ? prev : NULL);
+ SCX_CALL_OP(sch, dispatch, rq, scx_cpu_arg(cpu),
+ prev_on_sch ? prev : NULL);
} else {
/* stash @prev so that nested invocations can access it */
rq->scx.sub_dispatch_prev = prev;
- SCX_CALL_OP(sch, dispatch, rq, cpu, prev_on_sch ? prev : NULL);
+ SCX_CALL_OP(sch, dispatch, rq, scx_cpu_arg(cpu),
+ prev_on_sch ? prev : NULL);
rq->scx.sub_dispatch_prev = NULL;
}
@@ -2829,7 +3012,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* core. This callback complements ->cpu_release(), which is
* emitted in switch_class().
*/
- if (SCX_HAS_OP(sch, cpu_acquire))
+ if (sch->ops.cpu_acquire)
SCX_CALL_OP(sch, cpu_acquire, rq, cpu, NULL);
rq->scx.cpu_released = false;
}
@@ -2975,7 +3158,7 @@ static void switch_class(struct rq *rq, struct task_struct *next)
* next time that balance_one() is invoked.
*/
if (!rq->scx.cpu_released) {
- if (SCX_HAS_OP(sch, cpu_release)) {
+ if (sch->ops.cpu_release) {
struct scx_cpu_release_args args = {
.reason = preempt_reason_from_class(next_class),
.task = next,
@@ -3266,11 +3449,13 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
*ddsp_taskp = p;
this_rq()->scx.in_select_cpu = true;
- cpu = SCX_CALL_OP_TASK_RET(sch, select_cpu, NULL, p, prev_cpu, wake_flags);
+ cpu = SCX_CALL_OP_TASK_RET(sch, select_cpu, NULL, p,
+ scx_cpu_arg(prev_cpu), wake_flags);
+ cpu = scx_cpu_ret(sch, cpu);
this_rq()->scx.in_select_cpu = false;
p->scx.selected_cpu = cpu;
*ddsp_taskp = NULL;
- if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()"))
+ if (scx_cpu_valid(sch, cpu, "from ops.select_cpu()"))
return cpu;
else
return prev_cpu;
@@ -3316,7 +3501,7 @@ static void set_cpus_allowed_scx(struct task_struct *p,
* designation pointless. Cast it away when calling the operation.
*/
if (SCX_HAS_OP(sch, set_cpumask))
- SCX_CALL_OP_TASK(sch, set_cpumask, task_rq(p), p, (struct cpumask *)p->cpus_ptr);
+ scx_call_op_set_cpumask(sch, task_rq(p), p, (struct cpumask *)p->cpus_ptr);
}
static void handle_hotplug(struct rq *rq, bool online)
@@ -3338,9 +3523,9 @@ static void handle_hotplug(struct rq *rq, bool online)
scx_idle_update_selcpu_topology(&sch->ops);
if (online && SCX_HAS_OP(sch, cpu_online))
- SCX_CALL_OP(sch, cpu_online, NULL, cpu);
+ SCX_CALL_OP(sch, cpu_online, NULL, scx_cpu_arg(cpu));
else if (!online && SCX_HAS_OP(sch, cpu_offline))
- SCX_CALL_OP(sch, cpu_offline, NULL, cpu);
+ SCX_CALL_OP(sch, cpu_offline, NULL, scx_cpu_arg(cpu));
else
scx_exit(sch, SCX_EXIT_UNREG_KERN,
SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
@@ -3388,9 +3573,10 @@ static bool check_rq_for_timeouts(struct rq *rq)
last_runnable + READ_ONCE(sch->watchdog_timeout)))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
- scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
- "%s[%d] failed to run for %u.%03us",
- p->comm, p->pid, dur_ms / 1000, dur_ms % 1000);
+ __scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, cpu_of(rq),
+ "%s[%d] failed to run for %u.%03us",
+ p->comm, p->pid, dur_ms / 1000,
+ dur_ms % 1000);
timed_out = true;
break;
}
@@ -3487,41 +3673,6 @@ static struct cgroup *tg_cgrp(struct task_group *tg)
#endif /* CONFIG_EXT_GROUP_SCHED */
-static u32 scx_get_task_state(const struct task_struct *p)
-{
- return p->scx.flags & SCX_TASK_STATE_MASK;
-}
-
-static void scx_set_task_state(struct task_struct *p, u32 state)
-{
- u32 prev_state = scx_get_task_state(p);
- bool warn = false;
-
- switch (state) {
- case SCX_TASK_NONE:
- break;
- case SCX_TASK_INIT:
- warn = prev_state != SCX_TASK_NONE;
- break;
- case SCX_TASK_READY:
- warn = prev_state == SCX_TASK_NONE;
- break;
- case SCX_TASK_ENABLED:
- warn = prev_state != SCX_TASK_READY;
- break;
- default:
- WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]",
- prev_state, state, p->comm, p->pid);
- return;
- }
-
- WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]",
- prev_state, state, p->comm, p->pid);
-
- p->scx.flags &= ~SCX_TASK_STATE_MASK;
- p->scx.flags |= state;
-}
-
static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork)
{
int ret;
@@ -3573,22 +3724,6 @@ static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fo
return 0;
}
-static int scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork)
-{
- int ret;
-
- ret = __scx_init_task(sch, p, fork);
- if (!ret) {
- /*
- * While @p's rq is not locked. @p is not visible to the rest of
- * SCX yet and it's safe to update the flags and state.
- */
- p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
- scx_set_task_state(p, SCX_TASK_INIT);
- }
- return ret;
-}
-
static void __scx_enable_task(struct scx_sched *sch, struct task_struct *p)
{
struct rq *rq = task_rq(p);
@@ -3703,7 +3838,8 @@ static void scx_disable_and_exit_task(struct scx_sched *sch,
* If set, @p exited between __scx_init_task() and scx_enable_task() in
* scx_sub_enable() and is initialized for both the associated sched and
* its parent. Exit for the child too - scx_enable_task() never ran for
- * it, so undo only init_task.
+ * it, so undo only init_task. The flag is only set on the sub-enable
+ * path, so it's always clear when @p arrives here in %SCX_TASK_NONE.
*/
if (p->scx.flags & SCX_TASK_SUB_INIT) {
if (!WARN_ON_ONCE(!scx_enabling_sub_sched))
@@ -3728,6 +3864,33 @@ void init_scx_entity(struct sched_ext_entity *scx)
scx->slice = SCX_SLICE_DFL;
}
+/*
+ * Allocate a task ID. See scx_tid_alloc / scx_tid_cursor.
+ *
+ * IDs are handed out from a per-CPU range [next, end). When the current CPU's
+ * range is used up, a fresh range of SCX_TID_CHUNK IDs is reserved from the
+ * global scx_tid_cursor with a single atomic64_fetch_add(), so the shared
+ * counter is touched once per chunk rather than once per allocation.
+ *
+ * Returns the allocated ID.
+ */
+static u64 scx_alloc_tid(void)
+{
+ struct scx_tid_alloc *ta;
+
+ /* keep preemption off so @ta stays this CPU's range until we return */
+ guard(preempt)();
+ ta = this_cpu_ptr(&scx_tid_alloc);
+
+ /* local range exhausted (or never filled) - reserve the next chunk */
+ if (unlikely(ta->next >= ta->end)) {
+ ta->next = atomic64_fetch_add(SCX_TID_CHUNK, &scx_tid_cursor);
+ ta->end = ta->next + SCX_TID_CHUNK;
+ }
+ return ta->next++;
+}
+
+/*
+ * Insert @p into scx_tid_hash via @p->scx.tid_hash_node. The caller must hold
+ * scx_tasks_lock. A failed insertion is not propagated to the caller; it only
+ * triggers a one-time warning.
+ */
+static void scx_tid_hash_insert(struct task_struct *p)
+{
+ int ret;
+
+ lockdep_assert_held(&scx_tasks_lock);
+
+ ret = rhashtable_lookup_insert_fast(&scx_tid_hash,
+ &p->scx.tid_hash_node,
+ scx_tid_hash_params);
+ /* any non-zero return (e.g. duplicate key or allocation failure) is unexpected */
+ WARN_ON_ONCE(ret);
+}
+
void scx_pre_fork(struct task_struct *p)
{
/*
@@ -3745,16 +3908,22 @@ int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs)
percpu_rwsem_assert_held(&scx_fork_rwsem);
+ p->scx.tid = scx_alloc_tid();
+
if (scx_init_task_enabled) {
#ifdef CONFIG_EXT_SUB_SCHED
struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched;
#else
struct scx_sched *sch = scx_root;
#endif
- ret = scx_init_task(sch, p, true);
- if (!ret)
- scx_set_task_sched(p, sch);
- return ret;
+ scx_set_task_state(p, SCX_TASK_INIT_BEGIN);
+ ret = __scx_init_task(sch, p, true);
+ if (unlikely(ret)) {
+ scx_set_task_state(p, SCX_TASK_NONE);
+ return ret;
+ }
+ scx_set_task_state(p, SCX_TASK_INIT);
+ scx_set_task_sched(p, sch);
}
return 0;
@@ -3780,9 +3949,11 @@ void scx_post_fork(struct task_struct *p)
}
}
- raw_spin_lock_irq(&scx_tasks_lock);
- list_add_tail(&p->scx.tasks_node, &scx_tasks);
- raw_spin_unlock_irq(&scx_tasks_lock);
+ scoped_guard(raw_spinlock_irq, &scx_tasks_lock) {
+ list_add_tail(&p->scx.tasks_node, &scx_tasks);
+ if (scx_tid_to_task_enabled())
+ scx_tid_hash_insert(p);
+ }
percpu_up_read(&scx_fork_rwsem);
}
@@ -3833,28 +4004,41 @@ static bool task_dead_and_done(struct task_struct *p)
void sched_ext_dead(struct task_struct *p)
{
- unsigned long flags;
-
/*
* By the time control reaches here, @p has %TASK_DEAD set, switched out
* for the last time and then dropped the rq lock - task_dead_and_done()
* should be returning %true nullifying the straggling sched_class ops.
* Remove from scx_tasks and exit @p.
*/
- raw_spin_lock_irqsave(&scx_tasks_lock, flags);
- list_del_init(&p->scx.tasks_node);
- raw_spin_unlock_irqrestore(&scx_tasks_lock, flags);
+ scoped_guard(raw_spinlock_irqsave, &scx_tasks_lock) {
+ list_del_init(&p->scx.tasks_node);
+ if (scx_tid_to_task_enabled())
+ rhashtable_remove_fast(&scx_tid_hash,
+ &p->scx.tid_hash_node,
+ scx_tid_hash_params);
+ }
/*
* @p is off scx_tasks and wholly ours. scx_root_enable()'s READY ->
* ENABLED transitions can't race us. Disable ops for @p.
+ *
+ * %SCX_TASK_DEAD synchronizes against cgroup task iteration - see
+ * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup
+ * iteration is only used from sub-sched paths, which require root
+ * enabled. Root enable transitions every live task to at least READY.
+ *
+ * %INIT_BEGIN means ops.init_task() is running for @p. Don't call
+ * into ops; transition to %DEAD so the post-init recheck unwinds
+ * via scx_sub_init_cancel_task().
*/
if (scx_get_task_state(p) != SCX_TASK_NONE) {
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(p, &rf);
- scx_disable_and_exit_task(scx_task_sched(p), p);
+ if (scx_get_task_state(p) != SCX_TASK_INIT_BEGIN)
+ scx_disable_and_exit_task(scx_task_sched(p), p);
+ scx_set_task_state(p, SCX_TASK_DEAD);
task_rq_unlock(rq, p, &rf);
}
}
@@ -3892,7 +4076,7 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
* different scheduler class. Keep the BPF scheduler up-to-date.
*/
if (SCX_HAS_OP(sch, set_cpumask))
- SCX_CALL_OP_TASK(sch, set_cpumask, rq, p, (struct cpumask *)p->cpus_ptr);
+ scx_call_op_set_cpumask(sch, rq, p, (struct cpumask *)p->cpus_ptr);
}
static void switched_from_scx(struct rq *rq, struct task_struct *p)
@@ -3900,6 +4084,16 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p)
if (task_dead_and_done(p))
return;
+ /*
+ * %NONE means SCX is no longer tracking @p at the task level (e.g.
+ * scx_fail_parent() handed @p back to the parent at NONE pending the
+ * parent's own teardown). There is nothing to disable; calling
+ * scx_disable_task() would WARN on the non-%ENABLED state and trigger a
+ * NONE -> READY validation failure.
+ */
+ if (scx_get_task_state(p) == SCX_TASK_NONE)
+ return;
+
scx_disable_task(scx_task_sched(p), p);
}
@@ -4357,11 +4551,13 @@ void scx_cgroup_move_task(struct task_struct *p)
return;
/*
- * @p must have ops.cgroup_prep_move() called on it and thus
- * cgrp_moving_from set.
+ * scx_cgroup_can_attach() sets cgrp_moving_from only when the task's
+ * cgroup changes. Migration keys off css rather than cgroup identity,
+ * so it can hand an unchanged-cgroup task here with cgrp_moving_from
+ * NULL. Nothing to report to the BPF scheduler then, so skip it and
+ * keep prep_move and move paired.
*/
- if (SCX_HAS_OP(sch, cgroup_move) &&
- !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
+ if (SCX_HAS_OP(sch, cgroup_move) && p->scx.cgrp_moving_from)
SCX_CALL_OP_TASK(sch, cgroup_move, task_rq(p),
p, p->scx.cgrp_moving_from,
tg_cgrp(task_group(p)));
@@ -4463,9 +4659,9 @@ static void scx_cgroup_unlock(void)
#endif
}
#else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
-static struct cgroup *root_cgroup(void) { return NULL; }
-static void scx_cgroup_lock(void) {}
-static void scx_cgroup_unlock(void) {}
+static inline struct cgroup *root_cgroup(void) { return NULL; }
+static inline void scx_cgroup_lock(void) {}
+static inline void scx_cgroup_unlock(void) {}
#endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
#ifdef CONFIG_EXT_SUB_SCHED
@@ -4484,8 +4680,8 @@ static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch)
rcu_assign_pointer(pos->scx_sched, sch);
}
#else /* CONFIG_EXT_SUB_SCHED */
-static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; }
-static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {}
+static inline struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; }
+static inline void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {}
#endif /* CONFIG_EXT_SUB_SCHED */
/*
@@ -4771,6 +4967,48 @@ static const struct attribute_group scx_global_attr_group = {
static void free_pnode(struct scx_sched_pnode *pnode);
static void free_exit_info(struct scx_exit_info *ei);
+/*
+ * Allocate @sch->set_cmask_scratch: a per-CPU array of pointers, each pointing
+ * to an arena-allocated struct scx_cmask initialized with
+ * scx_cmask_init(slot, 0, num_possible_cpus()).
+ *
+ * Nothing is allocated unless @sch is cid-type and has an arena pool.
+ *
+ * Returns 0 on success or when no scratch is needed, -ENOMEM on allocation
+ * failure. On failure, whatever was allocated so far is left attached to @sch
+ * and is not unwound here.
+ *
+ * NOTE(review): presumably the caller's error path ends up in
+ * scx_set_cmask_scratch_free() (it is called when @sch is freed) - confirm
+ * that every failure path reaches it.
+ */
+static s32 scx_set_cmask_scratch_alloc(struct scx_sched *sch)
+{
+ size_t size = struct_size_t(struct scx_cmask, bits,
+ SCX_CMASK_NR_WORDS(num_possible_cpus()));
+ int cpu;
+
+ if (!sch->is_cid_type || !sch->arena_pool)
+ return 0;
+
+ sch->set_cmask_scratch = alloc_percpu(struct scx_cmask *);
+ if (!sch->set_cmask_scratch)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
+
+ *slot = scx_arena_alloc(sch, size);
+ if (!*slot)
+ return -ENOMEM;
+ scx_cmask_init(*slot, 0, num_possible_cpus());
+ }
+ return 0;
+}
+
+/*
+ * Release everything set up by scx_set_cmask_scratch_alloc(): free each
+ * per-CPU scratch mask back to the arena, free the per-CPU pointer array and
+ * clear @sch->set_cmask_scratch. Does nothing if the array was never
+ * allocated.
+ */
+static void scx_set_cmask_scratch_free(struct scx_sched *sch)
+{
+ /* must match the size used at allocation time */
+ size_t size = struct_size_t(struct scx_cmask, bits,
+ SCX_CMASK_NR_WORDS(num_possible_cpus()));
+ int cpu;
+
+ if (!sch->set_cmask_scratch)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
+
+ /*
+ * NOTE(review): *slot can be NULL here if
+ * scx_set_cmask_scratch_alloc() failed part-way through the
+ * CPUs. This assumes scx_arena_free() tolerates a NULL
+ * pointer - confirm.
+ */
+ scx_arena_free(sch, *slot, size);
+ }
+ free_percpu(sch->set_cmask_scratch);
+ sch->set_cmask_scratch = NULL;
+}
+
static void scx_sched_free_rcu_work(struct work_struct *work)
{
struct rcu_work *rcu_work = to_rcu_work(work);
@@ -4789,6 +5027,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
kfree(sch->cgrp_path);
if (sch_cgroup(sch))
cgroup_put(sch_cgroup(sch));
+ if (sch->sub_kset)
+ kobject_put(&sch->sub_kset->kobj);
#endif /* CONFIG_EXT_SUB_SCHED */
for_each_possible_cpu(cpu) {
@@ -4823,6 +5063,10 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
free_exit_info(sch->exit_info);
+ scx_set_cmask_scratch_free(sch);
+ scx_arena_pool_destroy(sch);
+ if (sch->arena_map)
+ bpf_map_put(sch->arena_map);
kfree(sch);
}
@@ -4912,10 +5156,30 @@ static const struct kset_uevent_ops scx_uevent_ops = {
*/
bool task_should_scx(int policy)
{
- if (!scx_enabled() || unlikely(scx_enable_state() == SCX_DISABLING))
+ /* if disabled, nothing should be on it */
+ if (!scx_enabled())
return false;
+
+ /* scx is taking over all SCHED_OTHER and SCHED_EXT tasks */
if (READ_ONCE(scx_switching_all))
return true;
+
+ /*
+ * scx is tearing down - keep new SCHED_EXT tasks out.
+ *
+ * Must come after scx_switching_all test, which serves as a proxy
+ * for __scx_switched_all. While __scx_switched_all is set, we must
+ * return true via the branch above: a fork routed to fair would
+ * stall because next_active_class() skips fair.
+ *
+ * This can develop into a deadlock - scx holds scx_enable_mutex across
+ * kthread_create() in scx_alloc_and_add_sched(); if the new kthread is
+ * the stalled task, the disable path can never grab the mutex to clear
+ * scx_switching_all.
+ */
+ if (unlikely(scx_enable_state() == SCX_DISABLING))
+ return false;
+
return policy == SCHED_EXT;
}
@@ -5494,6 +5758,7 @@ static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len)
if (!ei)
return NULL;
+ ei->exit_cpu = -1;
ei->bt = kzalloc_objs(ei->bt[0], SCX_EXIT_BT_LEN);
ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL);
@@ -5566,10 +5831,12 @@ static void refresh_watchdog(void)
static s32 scx_link_sched(struct scx_sched *sch)
{
+ const char *err_msg = "";
+ s32 ret = 0;
+
scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
#ifdef CONFIG_EXT_SUB_SCHED
struct scx_sched *parent = scx_parent(sch);
- s32 ret;
if (parent) {
/*
@@ -5579,15 +5846,16 @@ static s32 scx_link_sched(struct scx_sched *sch)
* parent can shoot us down.
*/
if (atomic_read(&parent->exit_kind) != SCX_EXIT_NONE) {
- scx_error(sch, "parent disabled");
- return -ENOENT;
+ err_msg = "parent disabled";
+ ret = -ENOENT;
+ break;
}
ret = rhashtable_lookup_insert_fast(&scx_sched_hash,
&sch->hash_node, scx_sched_hash_params);
if (ret) {
- scx_error(sch, "failed to insert into scx_sched_hash (%d)", ret);
- return ret;
+ err_msg = "failed to insert into scx_sched_hash";
+ break;
}
list_add_tail(&sch->sibling, &parent->children);
@@ -5597,6 +5865,15 @@ static s32 scx_link_sched(struct scx_sched *sch)
list_add_tail_rcu(&sch->all, &scx_sched_all);
}
+ /*
+ * scx_error() takes scx_sched_lock via scx_claim_exit(), so it must run after
+ * the guard above is released.
+ */
+ if (ret) {
+ scx_error(sch, "%s (%d)", err_msg, ret);
+ return ret;
+ }
+
refresh_watchdog();
return 0;
}
@@ -5628,6 +5905,26 @@ static void scx_disable_dump(struct scx_sched *sch)
sch->dump_disabled = true;
}
+/*
+ * Log that @sch has been disabled, using @sch->exit_info.
+ *
+ * Exit kinds at or above SCX_EXIT_ERROR are reported with pr_err(), followed
+ * by the exit message if there is one and, with CONFIG_STACKTRACE, the saved
+ * backtrace. Everything else is reported as a single pr_info() line. The
+ * message says "sub-scheduler" when @sch has a parent and "scheduler"
+ * otherwise.
+ */
+static void scx_log_sched_disable(struct scx_sched *sch)
+{
+ struct scx_exit_info *ei = sch->exit_info;
+ const char *type = scx_parent(sch) ? "sub-scheduler" : "scheduler";
+
+ if (ei->kind >= SCX_EXIT_ERROR) {
+ pr_err("sched_ext: BPF %s \"%s\" disabled (%s)\n", type,
+ sch->ops.name, ei->reason);
+
+ /* the detailed message is optional - skip the line if it's empty */
+ if (ei->msg[0] != '\0')
+ pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
+#ifdef CONFIG_STACKTRACE
+ stack_trace_print(ei->bt, ei->bt_len, 2);
+#endif
+ } else {
+ pr_info("sched_ext: BPF %s \"%s\" disabled (%s)\n", type,
+ sch->ops.name, ei->reason);
+ }
+}
+
#ifdef CONFIG_EXT_SUB_SCHED
static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
@@ -5666,7 +5963,7 @@ static void scx_fail_parent(struct scx_sched *sch,
scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
scx_disable_and_exit_task(sch, p);
- rcu_assign_pointer(p->scx.sched, parent);
+ scx_set_task_sched(p, parent);
}
}
scx_task_iter_stop(&sti);
@@ -5714,14 +6011,11 @@ static void scx_sub_disable(struct scx_sched *sch)
WARN_ON_ONCE(!scx_task_on_sched(sch, p));
/*
- * If $p is about to be freed, nothing prevents $sch from
- * unloading before $p reaches sched_ext_free(). Disable and
- * exit $p right away.
+ * @p is pinned by the iter: css_task_iter_next() takes a
+ * reference and holds it until the next iter_next() call, so
+ * @p->usage is guaranteed > 0.
*/
- if (!tryget_task_struct(p)) {
- scx_disable_and_exit_task(sch, p);
- continue;
- }
+ get_task_struct(p);
scx_task_iter_unlock(&sti);
@@ -5744,6 +6038,21 @@ static void scx_sub_disable(struct scx_sched *sch)
}
rq = task_rq_lock(p, &rf);
+
+ if (scx_get_task_state(p) == SCX_TASK_DEAD) {
+ /*
+ * sched_ext_dead() raced us between __scx_init_task()
+ * and this rq lock and ran exit_task() on @sch (the
+ * sched @p was on at that point), not on $parent.
+ * $parent's just-completed init is owed an exit_task()
+ * and we issue it here.
+ */
+ scx_sub_init_cancel_task(parent, p);
+ task_rq_unlock(rq, p, &rf);
+ put_task_struct(p);
+ continue;
+ }
+
scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
/*
* $p is initialized for $parent and still attached to
@@ -5752,13 +6061,14 @@ static void scx_sub_disable(struct scx_sched *sch)
* $p having already been initialized, and then enable.
*/
scx_disable_and_exit_task(sch, p);
+ scx_set_task_state(p, SCX_TASK_INIT_BEGIN);
scx_set_task_state(p, SCX_TASK_INIT);
- rcu_assign_pointer(p->scx.sched, parent);
+ scx_set_task_sched(p, parent);
scx_set_task_state(p, SCX_TASK_READY);
scx_enable_task(parent, p);
}
- task_rq_unlock(rq, p, &rf);
+ task_rq_unlock(rq, p, &rf);
put_task_struct(p);
}
scx_task_iter_stop(&sti);
@@ -5798,22 +6108,24 @@ static void scx_sub_disable(struct scx_sched *sch)
&sub_detach_args);
}
+ scx_log_sched_disable(sch);
+
if (sch->ops.exit)
SCX_CALL_OP(sch, exit, NULL, sch->exit_info);
if (sch->sub_kset)
- kset_unregister(sch->sub_kset);
+ kobject_del(&sch->sub_kset->kobj);
kobject_del(&sch->kobj);
}
#else /* CONFIG_EXT_SUB_SCHED */
-static void drain_descendants(struct scx_sched *sch) { }
-static void scx_sub_disable(struct scx_sched *sch) { }
+static inline void drain_descendants(struct scx_sched *sch) { }
+static inline void scx_sub_disable(struct scx_sched *sch) { }
#endif /* CONFIG_EXT_SUB_SCHED */
static void scx_root_disable(struct scx_sched *sch)
{
- struct scx_exit_info *ei = sch->exit_info;
struct scx_task_iter sti;
struct task_struct *p;
+ bool was_switched_all;
int cpu;
/* guarantee forward progress and wait for descendants to be disabled */
@@ -5840,6 +6152,8 @@ static void scx_root_disable(struct scx_sched *sch)
*/
mutex_lock(&scx_enable_mutex);
+ was_switched_all = scx_switched_all();
+
static_branch_disable(&__scx_switched_all);
WRITE_ONCE(scx_switching_all, false);
@@ -5889,34 +6203,51 @@ static void scx_root_disable(struct scx_sched *sch)
/*
* Invalidate all the rq clocks to prevent getting outdated
* rq clocks from a previous scx scheduler.
+ *
+ * Also re-balance the dl_server bandwidth reservations: detach
+ * ext_server (no more sched_ext tasks) and reinstate fair_server if it
+ * was previously detached because we were running in full mode.
+ *
+ * Unlike the enable path, this runs on a recovery path that cannot
+ * fail, so we use dl_server_swap_bw() to atomically free ext_server's
+ * bandwidth and reclaim it for fair_server under the same dl_b lock.
+ *
+ * The swap can still fail with -EBUSY if someone bumped ext_server's
+ * runtime via debugfs between enable and disable; in that narrow case
+ * both servers end up detached and we just WARN.
*/
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
+
scx_rq_clock_invalidate(rq);
+
+ scoped_guard(rq_lock_irqsave, rq) {
+ update_rq_clock(rq);
+ if (was_switched_all) {
+ if (WARN_ON_ONCE(dl_server_swap_bw(&rq->ext_server,
+ &rq->fair_server)))
+ pr_warn("failed to re-attach fair_server on CPU %d\n", cpu);
+ } else {
+ dl_server_detach_bw(&rq->ext_server);
+ }
+ }
}
/* no task is on scx, turn off all the switches and flush in-progress calls */
static_branch_disable(&__scx_enabled);
+ static_branch_disable(&__scx_is_cid_type);
+ if (sch->ops.flags & SCX_OPS_TID_TO_TASK)
+ static_branch_disable(&__scx_tid_to_task_enabled);
bitmap_zero(sch->has_op, SCX_OPI_END);
scx_idle_disable();
synchronize_rcu();
+ if (sch->ops.flags & SCX_OPS_TID_TO_TASK)
+ rhashtable_free_and_destroy(&scx_tid_hash, NULL, NULL);
- if (ei->kind >= SCX_EXIT_ERROR) {
- pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
- sch->ops.name, ei->reason);
-
- if (ei->msg[0] != '\0')
- pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
-#ifdef CONFIG_STACKTRACE
- stack_trace_print(ei->bt, ei->bt_len, 2);
-#endif
- } else {
- pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
- sch->ops.name, ei->reason);
- }
+ scx_log_sched_disable(sch);
if (sch->ops.exit)
- SCX_CALL_OP(sch, exit, NULL, ei);
+ SCX_CALL_OP(sch, exit, NULL, sch->exit_info);
scx_unlink_sched(sch);
@@ -5935,7 +6266,7 @@ static void scx_root_disable(struct scx_sched *sch)
*/
#ifdef CONFIG_EXT_SUB_SCHED
if (sch->sub_kset)
- kset_unregister(sch->sub_kset);
+ kobject_del(&sch->sub_kset->kobj);
#endif
kobject_del(&sch->kobj);
@@ -6214,6 +6545,94 @@ static void scx_dump_task(struct scx_sched *sch, struct seq_buf *s, struct scx_d
}
}
+/*
+ * Dump the state of one CPU into @s.
+ * @sch: scheduler whose ops.dump_cpu() is invoked and whose tasks are dumped
+ * @s: destination seq_buf
+ * @dctx: dump context passed through to ops.dump_cpu() and scx_dump_task()
+ * @cpu: CPU to dump
+ * @dump_all_tasks: dump every task on @cpu instead of only those on @sch
+ *
+ * Emits the standard per-CPU header, then whatever ops.dump_cpu() produces,
+ * then the current task (if it is in ext_sched_class) and the tasks on the
+ * rq's runnable list. An idle CPU for which ops.dump_cpu() is absent or
+ * produces no output is skipped entirely. @cpu's rq lock is held with IRQs
+ * saved for the whole call.
+ */
+static void scx_dump_cpu(struct scx_sched *sch, struct seq_buf *s,
+ struct scx_dump_ctx *dctx, int cpu,
+ bool dump_all_tasks)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+ struct task_struct *p;
+ struct seq_buf ns;
+ size_t avail, used;
+ char *buf;
+ bool idle;
+
+ rq_lock_irqsave(rq, &rf);
+
+ /* idle: nothing runnable on SCX and the idle task is current */
+ idle = list_empty(&rq->scx.runnable_list) &&
+ rq->curr->sched_class == &idle_sched_class;
+
+ if (idle && !SCX_HAS_OP(sch, dump_cpu))
+ goto next;
+
+ /*
+ * We don't yet know whether ops.dump_cpu() will produce output
+ * and we may want to skip the default CPU dump if it doesn't.
+ * Use a nested seq_buf to generate the standard dump so that we
+ * can decide whether to commit later.
+ */
+ avail = seq_buf_get_buf(s, &buf);
+ seq_buf_init(&ns, buf, avail);
+
+ dump_newline(&ns);
+ dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu",
+ cpu, rq->scx.nr_running, rq->scx.flags,
+ rq->scx.cpu_released, rq->scx.ops_qseq,
+ rq->scx.kick_sync);
+ dump_line(&ns, " curr=%s[%d] class=%ps",
+ rq->curr->comm, rq->curr->pid,
+ rq->curr->sched_class);
+ if (!cpumask_empty(rq->scx.cpus_to_kick))
+ dump_line(&ns, " cpus_to_kick : %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_kick));
+ if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle))
+ dump_line(&ns, " idle_to_kick : %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_kick_if_idle));
+ if (!cpumask_empty(rq->scx.cpus_to_preempt))
+ dump_line(&ns, " cpus_to_preempt: %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_preempt));
+ if (!cpumask_empty(rq->scx.cpus_to_wait))
+ dump_line(&ns, " cpus_to_wait : %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_wait));
+ if (!cpumask_empty(rq->scx.cpus_to_sync))
+ dump_line(&ns, " cpus_to_sync : %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_sync));
+
+ /* remember how much the standard header used to detect ops output below */
+ used = seq_buf_used(&ns);
+ if (SCX_HAS_OP(sch, dump_cpu)) {
+ ops_dump_init(&ns, " ");
+ SCX_CALL_OP(sch, dump_cpu, rq, dctx, scx_cpu_arg(cpu), idle);
+ ops_dump_exit();
+ }
+
+ /*
+ * If idle && nothing generated by ops.dump_cpu(), there's
+ * nothing interesting. Skip.
+ */
+ if (idle && used == seq_buf_used(&ns))
+ goto next;
+
+ /*
+ * $s may already have overflowed when $ns was created. If so,
+ * calling commit on it will trigger BUG.
+ */
+ if (avail) {
+ seq_buf_commit(s, seq_buf_used(&ns));
+ if (seq_buf_has_overflowed(&ns))
+ seq_buf_set_overflow(s);
+ }
+
+ /* the running task is marked with '*', runnable ones with ' ' */
+ if (rq->curr->sched_class == &ext_sched_class &&
+ (dump_all_tasks || scx_task_on_sched(sch, rq->curr)))
+ scx_dump_task(sch, s, dctx, rq, rq->curr, '*');
+
+ list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
+ if (dump_all_tasks || scx_task_on_sched(sch, p))
+ scx_dump_task(sch, s, dctx, rq, p, ' ');
+ /* single exit point: every path leaves through here to drop the rq lock */
+next:
+ rq_unlock_irqrestore(rq, &rf);
+}
+
/*
* Dump scheduler state. If @dump_all_tasks is true, dump all tasks regardless
* of which scheduler they belong to. If false, only dump tasks owned by @sch.
@@ -6234,7 +6653,6 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
};
struct seq_buf s;
struct scx_event_stats events;
- char *buf;
int cpu;
guard(raw_spinlock_irqsave)(&scx_dump_lock);
@@ -6255,8 +6673,13 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
if (ei->kind == SCX_EXIT_NONE) {
dump_line(&s, "Debug dump triggered by %s", ei->reason);
} else {
- dump_line(&s, "%s[%d] triggered exit kind %d:",
- current->comm, current->pid, ei->kind);
+ if (ei->exit_cpu >= 0)
+ dump_line(&s, "%s[%d] triggered exit kind %d on CPU %d:",
+ current->comm, current->pid, ei->kind,
+ ei->exit_cpu);
+ else
+ dump_line(&s, "%s[%d] triggered exit kind %d:",
+ current->comm, current->pid, ei->kind);
dump_line(&s, " %s (%s)", ei->reason, ei->msg);
dump_newline(&s);
dump_line(&s, "Backtrace:");
@@ -6273,88 +6696,15 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
dump_line(&s, "CPU states");
dump_line(&s, "----------");
+ /*
+ * Dump the exit CPU first so it isn't lost to dump truncation, then
+ * walk the rest in order, skipping the one already dumped.
+ */
+ if (ei->exit_cpu >= 0)
+ scx_dump_cpu(sch, &s, &dctx, ei->exit_cpu, dump_all_tasks);
for_each_possible_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
- struct task_struct *p;
- struct seq_buf ns;
- size_t avail, used;
- bool idle;
-
- rq_lock_irqsave(rq, &rf);
-
- idle = list_empty(&rq->scx.runnable_list) &&
- rq->curr->sched_class == &idle_sched_class;
-
- if (idle && !SCX_HAS_OP(sch, dump_cpu))
- goto next;
-
- /*
- * We don't yet know whether ops.dump_cpu() will produce output
- * and we may want to skip the default CPU dump if it doesn't.
- * Use a nested seq_buf to generate the standard dump so that we
- * can decide whether to commit later.
- */
- avail = seq_buf_get_buf(&s, &buf);
- seq_buf_init(&ns, buf, avail);
-
- dump_newline(&ns);
- dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu",
- cpu, rq->scx.nr_running, rq->scx.flags,
- rq->scx.cpu_released, rq->scx.ops_qseq,
- rq->scx.kick_sync);
- dump_line(&ns, " curr=%s[%d] class=%ps",
- rq->curr->comm, rq->curr->pid,
- rq->curr->sched_class);
- if (!cpumask_empty(rq->scx.cpus_to_kick))
- dump_line(&ns, " cpus_to_kick : %*pb",
- cpumask_pr_args(rq->scx.cpus_to_kick));
- if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle))
- dump_line(&ns, " idle_to_kick : %*pb",
- cpumask_pr_args(rq->scx.cpus_to_kick_if_idle));
- if (!cpumask_empty(rq->scx.cpus_to_preempt))
- dump_line(&ns, " cpus_to_preempt: %*pb",
- cpumask_pr_args(rq->scx.cpus_to_preempt));
- if (!cpumask_empty(rq->scx.cpus_to_wait))
- dump_line(&ns, " cpus_to_wait : %*pb",
- cpumask_pr_args(rq->scx.cpus_to_wait));
- if (!cpumask_empty(rq->scx.cpus_to_sync))
- dump_line(&ns, " cpus_to_sync : %*pb",
- cpumask_pr_args(rq->scx.cpus_to_sync));
-
- used = seq_buf_used(&ns);
- if (SCX_HAS_OP(sch, dump_cpu)) {
- ops_dump_init(&ns, " ");
- SCX_CALL_OP(sch, dump_cpu, rq, &dctx, cpu, idle);
- ops_dump_exit();
- }
-
- /*
- * If idle && nothing generated by ops.dump_cpu(), there's
- * nothing interesting. Skip.
- */
- if (idle && used == seq_buf_used(&ns))
- goto next;
-
- /*
- * $s may already have overflowed when $ns was created. If so,
- * calling commit on it will trigger BUG.
- */
- if (avail) {
- seq_buf_commit(&s, seq_buf_used(&ns));
- if (seq_buf_has_overflowed(&ns))
- seq_buf_set_overflow(&s);
- }
-
- if (rq->curr->sched_class == &ext_sched_class &&
- (dump_all_tasks || scx_task_on_sched(sch, rq->curr)))
- scx_dump_task(sch, &s, &dctx, rq, rq->curr, '*');
-
- list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
- if (dump_all_tasks || scx_task_on_sched(sch, p))
- scx_dump_task(sch, &s, &dctx, rq, p, ' ');
- next:
- rq_unlock_irqrestore(rq, &rf);
+ if (cpu != ei->exit_cpu)
+ scx_dump_cpu(sch, &s, &dctx, cpu, dump_all_tasks);
}
dump_newline(&s);
@@ -6392,9 +6742,9 @@ static void scx_disable_irq_workfn(struct irq_work *irq_work)
kthread_queue_work(sch->helper, &sch->disable_work);
}
-static bool scx_vexit(struct scx_sched *sch,
- enum scx_exit_kind kind, s64 exit_code,
- const char *fmt, va_list args)
+bool scx_vexit(struct scx_sched *sch,
+ enum scx_exit_kind kind, s64 exit_code, s32 exit_cpu,
+ const char *fmt, va_list args)
{
struct scx_exit_info *ei = sch->exit_info;
@@ -6416,6 +6766,7 @@ static bool scx_vexit(struct scx_sched *sch,
*/
ei->kind = kind;
ei->reason = scx_exit_reason(ei->kind);
+ ei->exit_cpu = exit_cpu;
irq_work_queue(&sch->disable_irq_work);
return true;
@@ -6473,13 +6824,32 @@ static struct scx_sched_pnode *alloc_pnode(struct scx_sched *sch, int node)
}
/*
+ * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid
+ * starvation. During the READY -> ENABLED task switching loop, the calling
+ * thread's sched_class gets switched from fair to ext. As fair has higher
+ * priority than ext, the calling thread can be indefinitely starved under
+ * fair-class saturation, leading to a system hang.
+ */
+struct scx_enable_cmd {
+ /* work item that scx_enable() queues on the helper kthread and then flushes */
+ struct kthread_work work;
+ /*
+ * The struct_ops kdata being enabled. is_cid_type selects which view
+ * scx_alloc_and_add_sched() copies into the new scx_sched.
+ */
+ union {
+ struct sched_ext_ops *ops;
+ struct sched_ext_ops_cid *ops_cid;
+ };
+ bool is_cid_type;
+ struct bpf_map *arena_map; /* arena ref to transfer to sch */
+ /* result that scx_enable() hands back to its caller once the work is flushed */
+ int ret;
+};
+
+/*
* Allocate and initialize a new scx_sched. @cgrp's reference is always
* consumed whether the function succeeds or fails.
*/
-static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
+static struct scx_sched *scx_alloc_and_add_sched(struct scx_enable_cmd *cmd,
struct cgroup *cgrp,
struct scx_sched *parent)
{
+ struct sched_ext_ops *ops = cmd->ops;
struct scx_sched *sch;
s32 level = parent ? parent->level + 1 : 0;
s32 node, cpu, ret, bypass_fail_cpu = nr_cpu_ids;
@@ -6559,7 +6929,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
sch->slice_dfl = SCX_SLICE_DFL;
atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
- init_irq_work(&sch->disable_irq_work, scx_disable_irq_workfn);
+ sch->disable_irq_work = IRQ_WORK_INIT_HARD(scx_disable_irq_workfn);
kthread_init_work(&sch->disable_work, scx_disable_workfn);
timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0);
@@ -6571,10 +6941,22 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
ret = -ENOMEM;
goto err_free_lb_cpumask;
}
- sch->ops = *ops;
+ /*
+ * Copy ops through the right union view. For cid-form the source is
+ * struct sched_ext_ops_cid which lacks the trailing cpu_acquire/
+ * cpu_release; those stay zero from kzalloc.
+ */
+ if (cmd->is_cid_type) {
+ sch->ops_cid = *cmd->ops_cid;
+ sch->is_cid_type = true;
+ } else {
+ sch->ops = *cmd->ops;
+ }
+
rcu_assign_pointer(ops->priv, sch);
sch->kobj.kset = scx_kset;
+ INIT_LIST_HEAD(&sch->all);
#ifdef CONFIG_EXT_SUB_SCHED
char *buf = kzalloc(PATH_MAX, GFP_KERNEL);
@@ -6602,6 +6984,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
if (ret < 0) {
+ RCU_INIT_POINTER(ops->priv, NULL);
kobject_put(&sch->kobj);
return ERR_PTR(ret);
}
@@ -6609,6 +6992,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
if (ops->sub_attach) {
sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj);
if (!sch->sub_kset) {
+ RCU_INIT_POINTER(ops->priv, NULL);
kobject_put(&sch->kobj);
return ERR_PTR(-ENOMEM);
}
@@ -6616,14 +7000,32 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
#else /* CONFIG_EXT_SUB_SCHED */
ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
if (ret < 0) {
+ RCU_INIT_POINTER(ops->priv, NULL);
kobject_put(&sch->kobj);
return ERR_PTR(ret);
}
#endif /* CONFIG_EXT_SUB_SCHED */
+
+ /*
+ * Consume the arena_map ref bpf_scx_reg_cid() took. Defer to here so
+ * earlier failure paths leave cmd->arena_map set and bpf_scx_reg_cid
+ * drops the ref. After this point, sch owns the ref and any cleanup
+ * runs through scx_sched_free_rcu_work() which puts it.
+ */
+ sch->arena_map = cmd->arena_map;
+ /* BPF arena is only available on MMU && 64BIT */
+#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
+ if (sch->arena_map)
+ sch->arena_kern_base = bpf_arena_map_kern_vm_start(sch->arena_map);
+#endif
+ cmd->arena_map = NULL;
return sch;
+#ifdef CONFIG_EXT_SUB_SCHED
err_free_lb_resched:
+ RCU_INIT_POINTER(ops->priv, NULL);
free_cpumask_var(sch->bypass_lb_resched_cpumask);
+#endif
err_free_lb_cpumask:
free_cpumask_var(sch->bypass_lb_donee_cpumask);
err_stop_helper:
@@ -6688,6 +7090,17 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
}
/*
+ * SCX_OPS_TID_TO_TASK is enabled by the root scheduler. A sub-sched
+ * may set it to declare a dependency; reject if the root hasn't
+ * enabled it.
+ */
+ if ((ops->flags & SCX_OPS_TID_TO_TASK) && scx_parent(sch) &&
+ !(scx_root->ops.flags & SCX_OPS_TID_TO_TASK)) {
+ scx_error(sch, "SCX_OPS_TID_TO_TASK requires root scheduler to enable it");
+ return -EINVAL;
+ }
+
+ /*
* SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle
* selection policy to be enabled.
*/
@@ -6697,25 +7110,34 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
return -EINVAL;
}
- if (ops->cpu_acquire || ops->cpu_release)
+ /*
+ * cid-form's struct is shorter and doesn't include the cpu_acquire /
+ * cpu_release tail; reading those fields off a cid-form @ops would
+ * run past the BPF allocation. Skip for cid-form.
+ */
+ if (!sch->is_cid_type && (ops->cpu_acquire || ops->cpu_release))
pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n");
+ /*
+ * Sub-scheduler support is tied to the cid-form struct_ops. A sub-sched
+ * attaches through a cid-form-only interface (sub_attach/sub_detach),
+ * and a root that accepts sub-scheds must expose cid-form state to
+ * them. Reject cpu-form schedulers on either side.
+ */
+ if (!sch->is_cid_type) {
+ if (scx_parent(sch)) {
+ scx_error(sch, "sub-sched requires cid-form struct_ops");
+ return -EINVAL;
+ }
+ if (ops->sub_attach || ops->sub_detach) {
+ scx_error(sch, "sub_attach/sub_detach requires cid-form struct_ops");
+ return -EINVAL;
+ }
+ }
+
return 0;
}
-/*
- * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid
- * starvation. During the READY -> ENABLED task switching loop, the calling
- * thread's sched_class gets switched from fair to ext. As fair has higher
- * priority than ext, the calling thread can be indefinitely starved under
- * fair-class saturation, leading to a system hang.
- */
-struct scx_enable_cmd {
- struct kthread_work work;
- struct sched_ext_ops *ops;
- int ret;
-};
-
static void scx_root_enable_workfn(struct kthread_work *work)
{
struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work);
@@ -6733,19 +7155,41 @@ static void scx_root_enable_workfn(struct kthread_work *work)
goto err_unlock;
}
+ /*
+ * @ops->priv binds @ops to its scx_sched instance. It is set here by
+ * scx_alloc_and_add_sched() and cleared at the tail of bpf_scx_unreg(),
+ * which runs after scx_root_disable() has dropped scx_enable_mutex. If
+ * it's still non-NULL here, a previous attachment on @ops has not
+ * finished tearing down; proceeding would let the in-flight unreg's
+ * RCU_INIT_POINTER(NULL) clobber the @ops->priv we are about to assign.
+ */
+ if (rcu_access_pointer(ops->priv)) {
+ ret = -EBUSY;
+ goto err_unlock;
+ }
+
ret = alloc_kick_syncs();
if (ret)
goto err_unlock;
+ if (ops->flags & SCX_OPS_TID_TO_TASK) {
+ ret = rhashtable_init(&scx_tid_hash, &scx_tid_hash_params);
+ if (ret)
+ goto err_free_ksyncs;
+ }
+
#ifdef CONFIG_EXT_SUB_SCHED
cgroup_get(cgrp);
#endif
- sch = scx_alloc_and_add_sched(ops, cgrp, NULL);
+ sch = scx_alloc_and_add_sched(cmd, cgrp, NULL);
if (IS_ERR(sch)) {
ret = PTR_ERR(sch);
- goto err_free_ksyncs;
+ goto err_free_tid_hash;
}
+ if (sch->is_cid_type)
+ static_branch_enable(&__scx_is_cid_type);
+
/*
* Transition to ENABLING and clear exit info to arm the disable path.
* Failure triggers full disabling from here on.
@@ -6769,6 +7213,18 @@ static void scx_root_enable_workfn(struct kthread_work *work)
cpus_read_lock();
/*
+ * Build the cid mapping before publishing scx_root. The cid kfuncs
+ * dereference the cid arrays unconditionally once scx_prog_sched()
+ * returns non-NULL; the rcu_assign_pointer() below pairs with their
+ * rcu_dereference() to make the populated arrays visible.
+ */
+ ret = scx_cid_init(sch);
+ if (ret) {
+ cpus_read_unlock();
+ goto err_disable;
+ }
+
+ /*
* Make the scheduler instance visible. Must be inside cpus_read_lock().
* See handle_hotplug().
*/
@@ -6793,6 +7249,18 @@ static void scx_root_enable_workfn(struct kthread_work *work)
sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
}
+ ret = scx_arena_pool_init(sch);
+ if (ret) {
+ cpus_read_unlock();
+ goto err_disable;
+ }
+
+ ret = scx_set_cmask_scratch_alloc(sch);
+ if (ret) {
+ cpus_read_unlock();
+ goto err_disable;
+ }
+
for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
if (((void (**)(void))ops)[i])
set_bit(i, sch->has_op);
@@ -6811,6 +7279,31 @@ static void scx_root_enable_workfn(struct kthread_work *work)
goto err_disable;
/*
+ * Attach the ext_server bandwidth reservation before anything is
+ * committed so that we can fail the enable if the root domain cannot
+ * accommodate it. The matching fair_server detach is deferred to the
+ * tail of this function, after the switch is fully committed and can no
+ * longer fail.
+ *
+ * On failure, err_disable funnels into scx_root_disable() which
+ * detaches ext_server, so partially-attached state is cleaned up
+ * automatically.
+ */
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ scoped_guard(rq_lock_irqsave, rq) {
+ update_rq_clock(rq);
+ ret = dl_server_attach_bw(&rq->ext_server);
+ }
+ if (ret) {
+ pr_warn("sched_ext: failed to attach ext_server on CPU %d (%d)\n",
+ cpu, ret);
+ goto err_disable;
+ }
+ }
+
+ /*
* Once __scx_enabled is set, %current can be switched to SCX anytime.
* This can lead to stalls as some BPF schedulers (e.g. userspace
* scheduling) may not function correctly before all tasks are switched.
@@ -6834,6 +7327,10 @@ static void scx_root_enable_workfn(struct kthread_work *work)
WARN_ON_ONCE(scx_init_task_enabled);
scx_init_task_enabled = true;
+ /* flip under fork_rwsem; the iter below covers existing tasks */
+ if (ops->flags & SCX_OPS_TID_TO_TASK)
+ static_branch_enable(&__scx_tid_to_task_enabled);
+
/*
* Enable ops for every task. Fork is excluded by scx_fork_rwsem
* preventing new tasks from being added. No need to exclude tasks
@@ -6856,26 +7353,60 @@ static void scx_root_enable_workfn(struct kthread_work *work)
scx_task_iter_start(&sti, NULL);
while ((p = scx_task_iter_next_locked(&sti))) {
/*
- * @p may already be dead, have lost all its usages counts and
- * be waiting for RCU grace period before being freed. @p can't
- * be initialized for SCX in such cases and should be ignored.
+ * @p is in scx_tasks under scx_tasks_lock, and SCX_TASK_DEAD
+ * tasks are filtered by scx_task_iter_next_locked().
+ * sched_ext_dead() removes @p from scx_tasks under the same
+ * lock before put_task_struct_rcu_user() runs, so @p->usage
+ * is guaranteed > 0 here.
*/
- if (!tryget_task_struct(p))
- continue;
+ get_task_struct(p);
+ /*
+ * Set %INIT_BEGIN under the iter's rq lock so that a concurrent
+ * sched_ext_dead() does not call ops.exit_task() on @p while
+ * ops.init_task() is running. If sched_ext_dead() runs before
+ * this store, it has already removed @p from scx_tasks and the
+ * iter won't visit @p; if it runs after, it observes
+ * %INIT_BEGIN and transitions to %DEAD without calling ops,
+ * leaving the post-init recheck below to unwind.
+ */
+ scx_set_task_state(p, SCX_TASK_INIT_BEGIN);
scx_task_iter_unlock(&sti);
- ret = scx_init_task(sch, p, false);
- if (ret) {
- put_task_struct(p);
+ ret = __scx_init_task(sch, p, false);
+
+ scx_task_iter_relock(&sti, p);
+
+ if (unlikely(ret)) {
+ if (scx_get_task_state(p) != SCX_TASK_DEAD)
+ scx_set_task_state(p, SCX_TASK_NONE);
scx_task_iter_stop(&sti);
scx_error(sch, "ops.init_task() failed (%d) for %s[%d]",
ret, p->comm, p->pid);
+ put_task_struct(p);
goto err_disable_unlock_all;
}
- scx_set_task_sched(p, sch);
- scx_set_task_state(p, SCX_TASK_READY);
+ if (scx_get_task_state(p) == SCX_TASK_DEAD) {
+ /*
+ * sched_ext_dead() observed %INIT_BEGIN and set %DEAD.
+ * ops.exit_task() is owed to the sched __scx_init_task()
+ * ran against; call it now.
+ */
+ scx_sub_init_cancel_task(sch, p);
+ } else {
+ scx_set_task_state(p, SCX_TASK_INIT);
+ scx_set_task_sched(p, sch);
+ scx_set_task_state(p, SCX_TASK_READY);
+ }
+
+ /*
+ * Insert into the tid hash. scx_tasks_lock is held by the iter;
+ * list_empty() guards against sched_ext_dead() having taken @p
+ * off the list while init ran unlocked.
+ */
+ if (scx_tid_to_task_enabled() && !list_empty(&p->scx.tasks_node))
+ scx_tid_hash_insert(p);
put_task_struct(p);
}
@@ -6926,6 +7457,25 @@ static void scx_root_enable_workfn(struct kthread_work *work)
if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL))
static_branch_enable(&__scx_switched_all);
+ /*
+ * Detach the fair_server bandwidth reservation now that the switch
+ * is fully committed. In full mode (!SCX_OPS_SWITCH_PARTIAL) no
+ * task will ever run in the fair class, so give that bandwidth
+ * back to the RT class. The matching ext_server attach already
+ * happened earlier; this only releases bandwidth and cannot fail.
+ *
+ * In partial mode keep fair_server attached.
+ */
+ if (scx_switched_all()) {
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ guard(rq_lock_irqsave)(rq);
+ update_rq_clock(rq);
+ dl_server_detach_bw(&rq->fair_server);
+ }
+ }
+
pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
sch->ops.name, scx_switched_all() ? "" : " (partial)");
kobject_uevent(&sch->kobj, KOBJ_ADD);
@@ -6936,6 +7486,9 @@ static void scx_root_enable_workfn(struct kthread_work *work)
cmd->ret = 0;
return;
+err_free_tid_hash:
+ if (ops->flags & SCX_OPS_TID_TO_TASK)
+ rhashtable_free_and_destroy(&scx_tid_hash, NULL, NULL);
err_free_ksyncs:
free_kick_syncs();
err_unlock:
@@ -7020,6 +7573,12 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
goto out_unlock;
}
+ /* See scx_root_enable_workfn() for the @ops->priv check. */
+ if (rcu_access_pointer(ops->priv)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
cgrp = cgroup_get_from_id(ops->sub_cgroup_id);
if (IS_ERR(cgrp)) {
ret = PTR_ERR(cgrp);
@@ -7037,7 +7596,7 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
raw_spin_unlock_irq(&scx_sched_lock);
/* scx_alloc_and_add_sched() consumes @cgrp whether it succeeds or not */
- sch = scx_alloc_and_add_sched(ops, cgrp, parent);
+ sch = scx_alloc_and_add_sched(cmd, cgrp, parent);
kobject_put(&parent->kobj);
if (IS_ERR(sch)) {
ret = PTR_ERR(sch);
@@ -7064,6 +7623,14 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
}
+ ret = scx_arena_pool_init(sch);
+ if (ret)
+ goto err_disable;
+
+ ret = scx_set_cmask_scratch_alloc(sch);
+ if (ret)
+ goto err_disable;
+
if (validate_ops(sch, ops))
goto err_disable;
@@ -7126,9 +7693,8 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
if (p->scx.flags & SCX_TASK_SUB_INIT)
continue;
- /* see scx_root_enable() */
- if (!tryget_task_struct(p))
- continue;
+ /* @p is pinned by the iter; see scx_sub_disable() */
+ get_task_struct(p);
if (!assert_task_ready_or_enabled(p)) {
ret = -EINVAL;
@@ -7146,6 +7712,21 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
goto abort;
rq = task_rq_lock(p, &rf);
+
+ if (scx_get_task_state(p) == SCX_TASK_DEAD) {
+ /*
+ * sched_ext_dead() raced us between __scx_init_task()
+ * and this rq lock and ran exit_task() on $parent (the
+ * sched @p was on at that point), not on @sch. @sch's
+ * just-completed init is owed an exit_task() and we
+ * issue it here.
+ */
+ scx_sub_init_cancel_task(sch, p);
+ task_rq_unlock(rq, p, &rf);
+ put_task_struct(p);
+ continue;
+ }
+
p->scx.flags |= SCX_TASK_SUB_INIT;
task_rq_unlock(rq, p, &rf);
@@ -7180,7 +7761,7 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
* $p is now only initialized for @sch and READY, which
* is what we want. Assign it to @sch and enable.
*/
- rcu_assign_pointer(p->scx.sched, sch);
+ scx_set_task_sched(p, sch);
scx_enable_task(sch, p);
p->scx.flags &= ~SCX_TASK_SUB_INIT;
@@ -7276,14 +7857,12 @@ static s32 __init scx_cgroup_lifetime_notifier_init(void)
core_initcall(scx_cgroup_lifetime_notifier_init);
#endif /* CONFIG_EXT_SUB_SCHED */
-static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+static s32 scx_enable(struct scx_enable_cmd *cmd, struct bpf_link *link)
{
static struct kthread_worker *helper;
static DEFINE_MUTEX(helper_mutex);
- struct scx_enable_cmd cmd;
- if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
- cpu_possible_mask)) {
+ if (housekeeping_enabled(HK_TYPE_DOMAIN_BOOT)) {
pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
return -EINVAL;
}
@@ -7304,16 +7883,15 @@ static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
}
#ifdef CONFIG_EXT_SUB_SCHED
- if (ops->sub_cgroup_id > 1)
- kthread_init_work(&cmd.work, scx_sub_enable_workfn);
+ if (cmd->ops->sub_cgroup_id > 1)
+ kthread_init_work(&cmd->work, scx_sub_enable_workfn);
else
#endif /* CONFIG_EXT_SUB_SCHED */
- kthread_init_work(&cmd.work, scx_root_enable_workfn);
- cmd.ops = ops;
+ kthread_init_work(&cmd->work, scx_root_enable_workfn);
- kthread_queue_work(READ_ONCE(helper), &cmd.work);
- kthread_flush_work(&cmd.work);
- return cmd.ret;
+ kthread_queue_work(READ_ONCE(helper), &cmd->work);
+ kthread_flush_work(&cmd->work);
+ return cmd->ret;
}
@@ -7485,7 +8063,62 @@ static int bpf_scx_check_member(const struct btf_type *t,
static int bpf_scx_reg(void *kdata, struct bpf_link *link)
{
- return scx_enable(kdata, link);
+ /*
+ * cpu-form registration: only .ops is set, so is_cid_type and
+ * arena_map stay zero-initialized. The return value is whatever
+ * scx_enable() reports.
+ */
+ struct scx_enable_cmd cmd = { .ops = kdata };
+
+ return scx_enable(&cmd, link);
+}
+
+/* Accumulator handed to scx_arena_scan_prog() as its @data argument. */
+struct scx_arena_scan {
+ struct bpf_map *arena; /* arena recorded so far; NULL until a prog contributes one */
+ int err; /* set to -EINVAL when a prog contributes a different arena */
+};
+
+/*
+ * Callback invoked for each struct_ops member prog while looking for the
+ * scheduler's arena. As the verifier enforces one arena per BPF program,
+ * bpf_prog_arena() yields at most one arena per prog. Progs without an
+ * arena are ignored; every prog that has one must agree on the same map.
+ *
+ * @data points to a struct scx_arena_scan. Returns 0 when @prog is
+ * acceptable. On a mismatch, records -EINVAL in ->err and returns 1
+ * (presumably ending the walk early - confirm against
+ * bpf_struct_ops_for_each_prog()).
+ */
+static int scx_arena_scan_prog(struct bpf_prog *prog, void *data)
+{
+	struct scx_arena_scan *scan = data;
+	struct bpf_map *found = NULL;
+
+	/* arena.o, which defines these, is built only on MMU && 64BIT */
+#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
+	found = bpf_prog_arena(prog);
+#endif
+	if (found) {
+		/* a second, different arena is a configuration error */
+		if (scan->arena && scan->arena != found) {
+			scan->err = -EINVAL;
+			return 1;
+		}
+		scan->arena = found;
+	}
+	return 0;
+}
+
+/*
+ * struct_ops ->reg() for cid-form schedulers. Scans the member progs for
+ * their arena, rejects schedulers using more than one arena or none at
+ * all, then runs scx_enable() with a reference held on the arena map.
+ * Returns scx_enable()'s result, or -errno from the arena checks.
+ */
+static int bpf_scx_reg_cid(void *kdata, struct bpf_link *link)
+{
+	struct scx_arena_scan scan = {};
+	struct scx_enable_cmd cmd = {
+		.ops_cid	= kdata,
+		.is_cid_type	= true,
+	};
+	int err;
+
+	bpf_struct_ops_for_each_prog(kdata, scx_arena_scan_prog, &scan);
+
+	if (scan.err) {
+		pr_err("sched_ext: cid-form scheduler uses multiple arena maps\n");
+		return scan.err;
+	}
+
+	if (!scan.arena) {
+		pr_err("sched_ext: cid-form scheduler must use a BPF arena map\n");
+		return -EINVAL;
+	}
+
+	/* pin the arena for the duration of the enable attempt */
+	cmd.arena_map = scan.arena;
+	bpf_map_inc(cmd.arena_map);
+
+	err = scx_enable(&cmd, link);
+
+	/* still set: not consumed by scx_alloc_and_add_sched(), drop our ref */
+	if (cmd.arena_map)
+		bpf_map_put(cmd.arena_map);
+
+	return err;
}
static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
@@ -7619,6 +8252,73 @@ static struct bpf_struct_ops bpf_sched_ext_ops = {
.cfi_stubs = &__bpf_ops_sched_ext_ops
};
+/*
+ * cfi stub for the cid-form set_cmask op. The cid-form stub table reuses a
+ * cpu-form stub wherever the parameter types are identical (only the
+ * parameter names differ across the two structs). set_cmask is the one op
+ * whose second argument type differs, so it needs this stub of its own.
+ */
+static void sched_ext_ops_cid__set_cmask(struct task_struct *p,
+					 const struct scx_cmask *cmask)
+{
+}
+
+/*
+ * cfi stub table for the cid-form struct_ops. Every entry except set_cmask
+ * points at a cpu-form sched_ext_ops__*() stub; the cid-named members
+ * (select_cid, cid_online, cid_offline, dump_cid) map onto the stubs of their
+ * cpu-named counterparts. No cpu_acquire / cpu_release entries are set here.
+ */
+static struct sched_ext_ops_cid __bpf_ops_sched_ext_ops_cid = {
+ .select_cid = sched_ext_ops__select_cpu,
+ .enqueue = sched_ext_ops__enqueue,
+ .dequeue = sched_ext_ops__dequeue,
+ .dispatch = sched_ext_ops__dispatch,
+ .tick = sched_ext_ops__tick,
+ .runnable = sched_ext_ops__runnable,
+ .running = sched_ext_ops__running,
+ .stopping = sched_ext_ops__stopping,
+ .quiescent = sched_ext_ops__quiescent,
+ .yield = sched_ext_ops__yield,
+ .core_sched_before = sched_ext_ops__core_sched_before,
+ .set_weight = sched_ext_ops__set_weight,
+ /* the only cid-specific stub: its second argument type differs */
+ .set_cmask = sched_ext_ops_cid__set_cmask,
+ .update_idle = sched_ext_ops__update_idle,
+ .init_task = sched_ext_ops__init_task,
+ .exit_task = sched_ext_ops__exit_task,
+ .enable = sched_ext_ops__enable,
+ .disable = sched_ext_ops__disable,
+#ifdef CONFIG_EXT_GROUP_SCHED
+ .cgroup_init = sched_ext_ops__cgroup_init,
+ .cgroup_exit = sched_ext_ops__cgroup_exit,
+ .cgroup_prep_move = sched_ext_ops__cgroup_prep_move,
+ .cgroup_move = sched_ext_ops__cgroup_move,
+ .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move,
+ .cgroup_set_weight = sched_ext_ops__cgroup_set_weight,
+ .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth,
+ .cgroup_set_idle = sched_ext_ops__cgroup_set_idle,
+#endif
+ .sub_attach = sched_ext_ops__sub_attach,
+ .sub_detach = sched_ext_ops__sub_detach,
+ .cid_online = sched_ext_ops__cpu_online,
+ .cid_offline = sched_ext_ops__cpu_offline,
+ .init = sched_ext_ops__init,
+ .exit = sched_ext_ops__exit,
+ .dump = sched_ext_ops__dump,
+ .dump_cid = sched_ext_ops__dump_cpu,
+ .dump_task = sched_ext_ops__dump_task,
+};
+
+/*
+ * The cid-form struct_ops reuses the generic bpf_scx_*() hooks: unreg,
+ * check_member, init_member, init, update, validate and the verifier ops.
+ * These process kdata as the byte block verified to match by the
+ * BUILD_BUG_ON checks in scx_init(). The one cid-specific hook is .reg,
+ * which is bpf_scx_reg_cid(); the stub table is likewise the cid-form one.
+ */
+static struct bpf_struct_ops bpf_sched_ext_ops_cid = {
+ .verifier_ops = &bpf_scx_verifier_ops,
+ .reg = bpf_scx_reg_cid,
+ .unreg = bpf_scx_unreg,
+ .check_member = bpf_scx_check_member,
+ .init_member = bpf_scx_init_member,
+ .init = bpf_scx_init,
+ .update = bpf_scx_update,
+ .validate = bpf_scx_validate,
+ .name = "sched_ext_ops_cid",
+ .owner = THIS_MODULE,
+ .cfi_stubs = &__bpf_ops_sched_ext_ops_cid
+};
+
/********************************************************************************
* System integration and init.
@@ -7628,13 +8328,11 @@ static void sysrq_handle_sched_ext_reset(u8 key)
{
struct scx_sched *sch;
- rcu_read_lock();
sch = rcu_dereference(scx_root);
if (likely(sch))
scx_disable(sch, SCX_EXIT_SYSRQ);
else
pr_info("sched_ext: BPF schedulers not loaded\n");
- rcu_read_unlock();
}
static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
@@ -7646,7 +8344,11 @@ static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
static void sysrq_handle_sched_ext_dump(u8 key)
{
- struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" };
+ struct scx_exit_info ei = {
+ .kind = SCX_EXIT_NONE,
+ .exit_cpu = -1,
+ .reason = "SysRq-D",
+ };
struct scx_sched *sch;
list_for_each_entry_rcu(sch, &scx_sched_all, all)
@@ -8716,9 +9418,6 @@ static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags)
struct rq *this_rq;
unsigned long irq_flags;
- if (!ops_cpu_valid(sch, cpu, NULL))
- return;
-
local_irq_save(irq_flags);
this_rq = this_rq();
@@ -8781,11 +9480,36 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux
guard(rcu)();
sch = scx_prog_sched(aux);
- if (likely(sch))
+ if (likely(sch) && scx_cpu_valid(sch, cpu, NULL))
scx_kick_cpu(sch, cpu, flags);
}
/**
+ * scx_bpf_kick_cid - Kick the CPU that @cid maps to
+ * @cid: cid whose CPU should be kicked
+ * @flags: %SCX_KICK_* flags
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * cid-addressed counterpart of scx_bpf_kick_cpu(). Returns 0 once the kick
+ * has been issued, -ENODEV if the calling program has no scheduler, or the
+ * negative value scx_cid_to_cpu() returned for @cid.
+ */
+__bpf_kfunc s32 scx_bpf_kick_cid(s32 cid, u64 flags, const struct bpf_prog_aux *aux)
+{
+	struct scx_sched *sch;
+	s32 target;
+
+	guard(rcu)();
+
+	sch = scx_prog_sched(aux);
+	if (unlikely(!sch))
+		return -ENODEV;
+
+	/* a negative result is the error to hand back to the caller */
+	target = scx_cid_to_cpu(sch, cid);
+	if (target >= 0) {
+		scx_kick_cpu(sch, target, flags);
+		return 0;
+	}
+	return target;
+}
+
+/**
* scx_bpf_dsq_nr_queued - Return the number of queued tasks
* @dsq_id: id of the DSQ
* @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
@@ -8811,9 +9535,9 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id, const struct bpf_prog_aux *aux
ret = READ_ONCE(this_rq()->scx.local_dsq.nr);
goto out;
} else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
- s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
+ s32 cpu = scx_cpu_ret(sch, dsq_id & SCX_DSQ_LOCAL_CPU_MASK);
- if (ops_cpu_valid(sch, cpu, NULL)) {
+ if (scx_cpu_valid(sch, cpu, NULL)) {
ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr);
goto out;
}
@@ -9031,6 +9755,7 @@ __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
__bpf_kfunc_end_defs();
+__printf(5, 0)
static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf,
size_t line_size, char *fmt, unsigned long long *data,
u32 data__sz)
@@ -9068,6 +9793,7 @@ static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf,
return ret;
}
+__printf(3, 0)
static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf,
char *fmt, unsigned long long *data, u32 data__sz)
{
@@ -9088,6 +9814,7 @@ __bpf_kfunc_start_defs();
* Indicate that the BPF scheduler wants to exit gracefully, and initiate ops
* disabling.
*/
+__printf(2, 0)
__bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
unsigned long long *data, u32 data__sz,
const struct bpf_prog_aux *aux)
@@ -9113,6 +9840,7 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
* Indicate that the BPF scheduler encountered a fatal error and initiate ops
* disabling.
*/
+__printf(1, 0)
__bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
u32 data__sz, const struct bpf_prog_aux *aux)
{
@@ -9140,6 +9868,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
* The extra dump may be multiple lines. A single line may be split over
* multiple calls. The last line is automatically terminated.
*/
+__printf(1, 0)
__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
u32 data__sz, const struct bpf_prog_aux *aux)
{
@@ -9202,13 +9931,36 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu, const struct bpf_prog_aux *aux)
guard(rcu)();
sch = scx_prog_sched(aux);
- if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
+ if (likely(sch) && scx_cpu_valid(sch, cpu, NULL))
return arch_scale_cpu_capacity(cpu);
else
return SCX_CPUPERF_ONE;
}
/**
+ * scx_bpf_cidperf_cap - Maximum relative capacity of the CPU behind @cid
+ * @cid: cid of the CPU to query
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * cid-addressed counterpart of scx_bpf_cpuperf_cap(). Falls back to
+ * %SCX_CPUPERF_ONE when the calling program has no scheduler or @cid does
+ * not translate to a CPU.
+ */
+__bpf_kfunc u32 scx_bpf_cidperf_cap(s32 cid, const struct bpf_prog_aux *aux)
+{
+	struct scx_sched *sch;
+
+	guard(rcu)();
+
+	sch = scx_prog_sched(aux);
+	if (likely(sch)) {
+		s32 target = scx_cid_to_cpu(sch, cid);
+
+		if (target >= 0)
+			return arch_scale_cpu_capacity(target);
+	}
+	return SCX_CPUPERF_ONE;
+}
+
+/**
* scx_bpf_cpuperf_cur - Query the current relative performance of a CPU
* @cpu: CPU of interest
* @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
@@ -9230,13 +9982,36 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu, const struct bpf_prog_aux *aux)
guard(rcu)();
sch = scx_prog_sched(aux);
- if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
+ if (likely(sch) && scx_cpu_valid(sch, cpu, NULL))
return arch_scale_freq_capacity(cpu);
else
return SCX_CPUPERF_ONE;
}
/**
+ * scx_bpf_cidperf_cur - Current relative performance of the CPU behind @cid
+ * @cid: cid of the CPU to query
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * cid-addressed counterpart of scx_bpf_cpuperf_cur(). Falls back to
+ * %SCX_CPUPERF_ONE when the calling program has no scheduler or @cid does
+ * not translate to a CPU.
+ */
+__bpf_kfunc u32 scx_bpf_cidperf_cur(s32 cid, const struct bpf_prog_aux *aux)
+{
+	struct scx_sched *sch;
+
+	guard(rcu)();
+
+	sch = scx_prog_sched(aux);
+	if (likely(sch)) {
+		s32 target = scx_cid_to_cpu(sch, cid);
+
+		if (target >= 0)
+			return arch_scale_freq_capacity(target);
+	}
+	return SCX_CPUPERF_ONE;
+}
+
+/**
* scx_bpf_cpuperf_set - Set the relative performance target of a CPU
* @cpu: CPU of interest
* @perf: target performance level [0, %SCX_CPUPERF_ONE]
@@ -9266,7 +10041,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf, const struct bpf_prog_au
return;
}
- if (ops_cpu_valid(sch, cpu, NULL)) {
+ if (scx_cpu_valid(sch, cpu, NULL)) {
struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq();
struct rq_flags rf;
@@ -9297,6 +10072,31 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf, const struct bpf_prog_au
}
/**
+ * scx_bpf_cidperf_set - Set the performance target of the CPU behind @cid
+ * @cid: cid of the CPU to target
+ * @perf: target performance level [0, %SCX_CPUPERF_ONE]
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * cid-addressed counterpart of scx_bpf_cpuperf_set(): translates @cid and
+ * forwards to it. Does nothing when the calling program has no scheduler
+ * or @cid does not translate to a CPU.
+ */
+__bpf_kfunc void scx_bpf_cidperf_set(s32 cid, u32 perf,
+				     const struct bpf_prog_aux *aux)
+{
+	struct scx_sched *sch;
+
+	guard(rcu)();
+
+	sch = scx_prog_sched(aux);
+	if (likely(sch)) {
+		s32 target = scx_cid_to_cpu(sch, cid);
+
+		if (target >= 0)
+			scx_bpf_cpuperf_set(target, perf, aux);
+	}
+}
+
+/**
* scx_bpf_nr_node_ids - Return the number of possible node IDs
*
* All valid node IDs in the system are smaller than the returned value.
@@ -9317,6 +10117,47 @@ __bpf_kfunc u32 scx_bpf_nr_cpu_ids(void)
}
/**
+ * scx_bpf_nr_cids - Size of the cid space
+ *
+ * Returns num_possible_cpus(). Every valid cid lies in the half-open range
+ * [0, return value).
+ */
+__bpf_kfunc u32 scx_bpf_nr_cids(void)
+{
+	u32 nr_cids = num_possible_cpus();
+
+	return nr_cids;
+}
+
+/**
+ * scx_bpf_nr_online_cids - Number of CPUs currently online, in cid terms
+ *
+ * Returns num_online_cpus(). Under the standard model the scheduler is
+ * restarted on hotplug, so a scheduler may treat [0, nr_online_cids) as the
+ * online range. A scheduler that wants to ride out hotplug without a
+ * restart should instead install a custom mapping via
+ * scx_bpf_cid_override() and follow onlining through the ops.cid_online /
+ * ops.cid_offline callbacks.
+ */
+__bpf_kfunc u32 scx_bpf_nr_online_cids(void)
+{
+	u32 nr_online = num_online_cpus();
+
+	return nr_online;
+}
+
+/**
+ * scx_bpf_this_cid - cid of the CPU the calling program is running on
+ *
+ * cid-addressed counterpart of bpf_get_smp_processor_id() for scx programs.
+ * The current CPU is always a valid index, so the translation is a plain
+ * table lookup. Returns -EINVAL when the cid table has not been allocated,
+ * i.e. a non-SCX program called this before any scheduler was ever enabled.
+ */
+__bpf_kfunc s32 scx_bpf_this_cid(void)
+{
+	s16 *cid_tbl = READ_ONCE(scx_cpu_to_cid_tbl);
+
+	return cid_tbl ? cid_tbl[raw_smp_processor_id()] : -EINVAL;
+}
+
+/**
* scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void)
@@ -9365,6 +10206,23 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
}
/**
+ * scx_bpf_task_cid - cid of the CPU a task is currently associated with
+ * @p: task of interest
+ *
+ * cid-addressed counterpart of scx_bpf_task_cpu(). task_cpu(p) is always a
+ * valid CPU, so the translation is a plain table lookup. Returns -EINVAL
+ * when the cid table has not been allocated, i.e. a non-SCX program called
+ * this before any scheduler was ever enabled.
+ */
+__bpf_kfunc s32 scx_bpf_task_cid(const struct task_struct *p)
+{
+	s16 *cid_tbl = READ_ONCE(scx_cpu_to_cid_tbl);
+
+	return cid_tbl ? cid_tbl[task_cpu(p)] : -EINVAL;
+}
+
+/**
* scx_bpf_cpu_rq - Fetch the rq of a CPU
* @cpu: CPU of the rq
* @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
@@ -9379,7 +10237,7 @@ __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu, const struct bpf_prog_aux *aux)
if (unlikely(!sch))
return NULL;
- if (!ops_cpu_valid(sch, cpu, NULL))
+ if (!scx_cpu_valid(sch, cpu, NULL))
return NULL;
if (!sch->warned_deprecated_rq) {
@@ -9436,13 +10294,65 @@ __bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu, const struct bpf_prog_
if (unlikely(!sch))
return NULL;
- if (!ops_cpu_valid(sch, cpu, NULL))
+ if (!scx_cpu_valid(sch, cpu, NULL))
return NULL;
return rcu_dereference(cpu_rq(cpu)->curr);
}
/**
+ * scx_bpf_cid_curr - Return the curr task on the CPU at @cid
+ * @cid: cid of interest
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
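+ * cid-addressed equivalent of scx_bpf_cpu_curr(). Callers must hold RCU
+ * read lock (KF_RCU_PROTECTED). Returns NULL if no scheduler is found for
+ * the calling program or @cid does not translate to a cpu.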
+ */
+__bpf_kfunc struct task_struct *scx_bpf_cid_curr(s32 cid, const struct bpf_prog_aux *aux)
+{
+ struct scx_sched *sch;
+ s32 cpu;
+
+ guard(rcu)();
+
+ sch = scx_prog_sched(aux);
+ if (unlikely(!sch))
+ return NULL;
+ cpu = scx_cid_to_cpu(sch, cid);
+ if (cpu < 0)
+ return NULL;
+ return rcu_dereference(cpu_rq(cpu)->curr);
+}
+
+/**
+ * scx_bpf_tid_to_task - Look up a task by its scx tid
+ * @tid: task ID previously read from p->scx.tid
+ *
+ * Returns the task with the given tid, or NULL if no such task exists. The
+ * returned pointer is valid until the end of the current RCU read section
+ * (KF_RCU_PROTECTED). Requires SCX_OPS_TID_TO_TASK to be set on the root
+ * scheduler; otherwise an error is raised and NULL returned.
+ */
+__bpf_kfunc struct task_struct *scx_bpf_tid_to_task(u64 tid)
+{
+ struct sched_ext_entity *scx;
+
+ if (!scx_tid_to_task_enabled()) {
+ struct scx_sched *sch = rcu_dereference(scx_root);
+
+ if (sch)
+ scx_error(sch, "scx_bpf_tid_to_task() called without SCX_OPS_TID_TO_TASK");
+ return NULL;
+ }
+
+ scx = rhashtable_lookup(&scx_tid_hash, &tid, scx_tid_hash_params);
+ if (!scx)
+ return NULL;
+
+ return container_of(scx, struct task_struct, scx);
+}
+
+/**
* scx_bpf_now - Returns a high-performance monotonically non-decreasing
* clock for the current CPU. The clock returned is in nanoseconds.
*
@@ -9601,6 +10511,7 @@ BTF_KFUNCS_START(scx_kfunc_ids_any)
BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU);
BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU);
BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_kick_cid, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_destroy_dsq, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL)
@@ -9615,16 +10526,25 @@ BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cidperf_cap, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cidperf_cur, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cidperf_set, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_nr_node_ids)
BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
+BTF_ID_FLAGS(func, scx_bpf_nr_cids)
+BTF_ID_FLAGS(func, scx_bpf_nr_online_cids)
+BTF_ID_FLAGS(func, scx_bpf_this_cid)
BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_task_cid, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_IMPLICIT_ARGS | KF_RET_NULL)
BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, scx_bpf_cid_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, scx_bpf_tid_to_task, KF_RET_NULL | KF_RCU_PROTECTED)
BTF_ID_FLAGS(func, scx_bpf_now)
BTF_ID_FLAGS(func, scx_bpf_events)
#ifdef CONFIG_CGROUP_SCHED
@@ -9639,6 +10559,47 @@ static const struct btf_kfunc_id_set scx_kfunc_set_any = {
};
/*
+ * cpu-form kfuncs that are forbidden from cid-form schedulers
+ * (bpf_sched_ext_ops_cid). Programs targeting the cid struct_ops type must
+ * use the cid-form alternative (cid/cmask kfuncs).
+ *
+ * Membership overlaps with scx_kfunc_ids_{any,idle,select_cpu}; the filter
+ * tests this set independently and rejects matches before the per-op
+ * allow-list check runs.
+ *
+ * pahole/resolve_btfids scans every BTF_ID_FLAGS() at build time and
+ * intersects flags across duplicate entries, so each entry must carry the
+ * same flags as the kfunc's primary declaration; otherwise the flags get
+ * dropped globally.
+ */
+BTF_KFUNCS_START(scx_kfunc_ids_cpu_only)
+BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, scx_bpf_cpu_node, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_IMPLICIT_ARGS | KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_IMPLICIT_ARGS | KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
+BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_cpu_only)
+
+/*
* Per-op kfunc allow flags. Each bit corresponds to a context-sensitive kfunc
* group; an op may permit zero or more groups, with the union expressed in
* scx_kf_allow_flags[]. The verifier-time filter (scx_kfunc_context_filter())
@@ -9647,10 +10608,11 @@ static const struct btf_kfunc_id_set scx_kfunc_set_any = {
*/
enum scx_kf_allow_flags {
SCX_KF_ALLOW_UNLOCKED = 1 << 0,
- SCX_KF_ALLOW_CPU_RELEASE = 1 << 1,
- SCX_KF_ALLOW_DISPATCH = 1 << 2,
- SCX_KF_ALLOW_ENQUEUE = 1 << 3,
- SCX_KF_ALLOW_SELECT_CPU = 1 << 4,
+ SCX_KF_ALLOW_INIT = 1 << 1,
+ SCX_KF_ALLOW_CPU_RELEASE = 1 << 2,
+ SCX_KF_ALLOW_DISPATCH = 1 << 3,
+ SCX_KF_ALLOW_ENQUEUE = 1 << 4,
+ SCX_KF_ALLOW_SELECT_CPU = 1 << 5,
};
/*
@@ -9678,7 +10640,7 @@ static const u32 scx_kf_allow_flags[] = {
[SCX_OP_IDX(sub_detach)] = SCX_KF_ALLOW_UNLOCKED,
[SCX_OP_IDX(cpu_online)] = SCX_KF_ALLOW_UNLOCKED,
[SCX_OP_IDX(cpu_offline)] = SCX_KF_ALLOW_UNLOCKED,
- [SCX_OP_IDX(init)] = SCX_KF_ALLOW_UNLOCKED,
+ [SCX_OP_IDX(init)] = SCX_KF_ALLOW_UNLOCKED | SCX_KF_ALLOW_INIT,
[SCX_OP_IDX(exit)] = SCX_KF_ALLOW_UNLOCKED,
};
@@ -9693,16 +10655,18 @@ static const u32 scx_kf_allow_flags[] = {
int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id)
{
bool in_unlocked = btf_id_set8_contains(&scx_kfunc_ids_unlocked, kfunc_id);
+ bool in_init = btf_id_set8_contains(&scx_kfunc_ids_init, kfunc_id);
bool in_select_cpu = btf_id_set8_contains(&scx_kfunc_ids_select_cpu, kfunc_id);
bool in_enqueue = btf_id_set8_contains(&scx_kfunc_ids_enqueue_dispatch, kfunc_id);
bool in_dispatch = btf_id_set8_contains(&scx_kfunc_ids_dispatch, kfunc_id);
bool in_cpu_release = btf_id_set8_contains(&scx_kfunc_ids_cpu_release, kfunc_id);
bool in_idle = btf_id_set8_contains(&scx_kfunc_ids_idle, kfunc_id);
bool in_any = btf_id_set8_contains(&scx_kfunc_ids_any, kfunc_id);
+ bool in_cpu_only = btf_id_set8_contains(&scx_kfunc_ids_cpu_only, kfunc_id);
u32 moff, flags;
/* Not an SCX kfunc - allow. */
- if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch ||
+ if (!(in_unlocked || in_init || in_select_cpu || in_enqueue || in_dispatch ||
in_cpu_release || in_idle || in_any))
return 0;
@@ -9725,8 +10689,24 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id)
/*
* Non-SCX struct_ops: SCX kfuncs are not permitted.
+ *
+ * Both bpf_sched_ext_ops (cpu-form) and bpf_sched_ext_ops_cid
+ * (cid-form) are valid SCX struct_ops. Member offsets match between
+ * the two (verified by BUILD_BUG_ON in scx_init()), so the shared
+ * scx_kf_allow_flags[] table indexed by SCX_MOFF_IDX(moff) applies to
+ * both.
+ */
+ if (prog->aux->st_ops != &bpf_sched_ext_ops &&
+ prog->aux->st_ops != &bpf_sched_ext_ops_cid)
+ return -EACCES;
+
+ /*
+ * cid-form schedulers must use cid/cmask kfuncs. cid and cpu are both
+ * small s32s and trivially confused, so cpu-only kfuncs are rejected at
+ * load time. The reverse (cpu-form calling cid-form kfuncs) is
+ * intentionally permissive to ease gradual cpumask -> cid migration.
*/
- if (prog->aux->st_ops != &bpf_sched_ext_ops)
+ if (prog->aux->st_ops == &bpf_sched_ext_ops_cid && in_cpu_only)
return -EACCES;
/* SCX struct_ops: check the per-op allow list. */
@@ -9738,6 +10718,8 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id)
if ((flags & SCX_KF_ALLOW_UNLOCKED) && in_unlocked)
return 0;
+ if ((flags & SCX_KF_ALLOW_INIT) && in_init)
+ return 0;
if ((flags & SCX_KF_ALLOW_CPU_RELEASE) && in_cpu_release)
return 0;
if ((flags & SCX_KF_ALLOW_DISPATCH) && in_dispatch)
@@ -9755,6 +10737,73 @@ static int __init scx_init(void)
int ret;
/*
+ * sched_ext_ops_cid mirrors sched_ext_ops up to and including @priv.
+ * Both bpf_scx_init_member() and bpf_scx_check_member() use offsets
+ * from struct sched_ext_ops; sched_ext_ops_cid relies on those offsets
+ * matching for the shared fields. Catch any drift at build time.
+ */
+#define CID_OFFSET_MATCH(cpu_field, cid_field) \
+ BUILD_BUG_ON(offsetof(struct sched_ext_ops, cpu_field) != \
+ offsetof(struct sched_ext_ops_cid, cid_field))
+ /* data fields used by bpf_scx_init_member() */
+ CID_OFFSET_MATCH(dispatch_max_batch, dispatch_max_batch);
+ CID_OFFSET_MATCH(flags, flags);
+ CID_OFFSET_MATCH(name, name);
+ CID_OFFSET_MATCH(timeout_ms, timeout_ms);
+ CID_OFFSET_MATCH(exit_dump_len, exit_dump_len);
+ CID_OFFSET_MATCH(hotplug_seq, hotplug_seq);
+ CID_OFFSET_MATCH(sub_cgroup_id, sub_cgroup_id);
+ /* shared callbacks: the union view requires byte-for-byte offset match */
+ CID_OFFSET_MATCH(enqueue, enqueue);
+ CID_OFFSET_MATCH(dequeue, dequeue);
+ CID_OFFSET_MATCH(dispatch, dispatch);
+ CID_OFFSET_MATCH(tick, tick);
+ CID_OFFSET_MATCH(runnable, runnable);
+ CID_OFFSET_MATCH(running, running);
+ CID_OFFSET_MATCH(stopping, stopping);
+ CID_OFFSET_MATCH(quiescent, quiescent);
+ CID_OFFSET_MATCH(yield, yield);
+ CID_OFFSET_MATCH(core_sched_before, core_sched_before);
+ CID_OFFSET_MATCH(set_weight, set_weight);
+ CID_OFFSET_MATCH(update_idle, update_idle);
+ CID_OFFSET_MATCH(init_task, init_task);
+ CID_OFFSET_MATCH(exit_task, exit_task);
+ CID_OFFSET_MATCH(enable, enable);
+ CID_OFFSET_MATCH(disable, disable);
+ CID_OFFSET_MATCH(dump, dump);
+ CID_OFFSET_MATCH(dump_task, dump_task);
+ CID_OFFSET_MATCH(sub_attach, sub_attach);
+ CID_OFFSET_MATCH(sub_detach, sub_detach);
+ CID_OFFSET_MATCH(init, init);
+ CID_OFFSET_MATCH(exit, exit);
+#ifdef CONFIG_EXT_GROUP_SCHED
+ CID_OFFSET_MATCH(cgroup_init, cgroup_init);
+ CID_OFFSET_MATCH(cgroup_exit, cgroup_exit);
+ CID_OFFSET_MATCH(cgroup_prep_move, cgroup_prep_move);
+ CID_OFFSET_MATCH(cgroup_move, cgroup_move);
+ CID_OFFSET_MATCH(cgroup_cancel_move, cgroup_cancel_move);
+ CID_OFFSET_MATCH(cgroup_set_weight, cgroup_set_weight);
+ CID_OFFSET_MATCH(cgroup_set_bandwidth, cgroup_set_bandwidth);
+ CID_OFFSET_MATCH(cgroup_set_idle, cgroup_set_idle);
+#endif
+ /* renamed callbacks must occupy the same slot as their cpu-form sibling */
+ CID_OFFSET_MATCH(select_cpu, select_cid);
+ CID_OFFSET_MATCH(set_cpumask, set_cmask);
+ CID_OFFSET_MATCH(cpu_online, cid_online);
+ CID_OFFSET_MATCH(cpu_offline, cid_offline);
+ CID_OFFSET_MATCH(dump_cpu, dump_cid);
+ /* @priv tail must align since both share the same data block */
+ CID_OFFSET_MATCH(priv, priv);
+ /*
+ * cid-form must end exactly at @priv - validate_ops() skips
+ * cpu_acquire/cpu_release for cid-form because reading those fields
+ * past the BPF allocation would be UB.
+ */
+ BUILD_BUG_ON(offsetof(struct sched_ext_ops_cid, __end) !=
+ offsetofend(struct sched_ext_ops, priv));
+#undef CID_OFFSET_MATCH
+
+ /*
* kfunc registration can't be done from init_sched_ext_class() as
* register_btf_kfunc_id_set() needs most of the system to be up.
*
@@ -9792,12 +10841,24 @@ static int __init scx_init(void)
return ret;
}
+ ret = scx_cid_kfunc_init();
+ if (ret) {
+ pr_err("sched_ext: Failed to register cid kfuncs (%d)\n", ret);
+ return ret;
+ }
+
ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops);
if (ret) {
pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret);
return ret;
}
+ ret = register_bpf_struct_ops(&bpf_sched_ext_ops_cid, sched_ext_ops_cid);
+ if (ret) {
+ pr_err("sched_ext: Failed to register cid struct_ops (%d)\n", ret);
+ return ret;
+ }
+
ret = register_pm_notifier(&scx_pm_notifier);
if (ret) {
pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret);
diff --git a/kernel/sched/ext_arena.c b/kernel/sched/ext_arena.c
new file mode 100644
index 000000000000..493c2424f842
--- /dev/null
+++ b/kernel/sched/ext_arena.c
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * scx_arena_pool: kernel-side sub-allocator over BPF-arena pages.
+ *
+ * Each chunk added to @sch->arena_pool comes from one
+ * bpf_arena_alloc_pages_sleepable() call and is registered at the
+ * kernel-side mapping address. Callers translate to the BPF-arena form
+ * themselves if needed.
+ *
+ * Allocations grow the pool on demand. Underlying arena pages are released
+ * when the arena map itself is torn down.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+
+enum scx_arena_consts {
+ SCX_ARENA_MIN_ORDER = 3, /* 8-byte minimum sub-allocation */
+ SCX_ARENA_GROW_PAGES = 4, /* per growth */
+};
+
+/*
+ * Create the gen_pool behind @sch->arena_pool. No chunks are added here; the
+ * pool is populated later by scx_arena_grow(). A scheduler without an arena
+ * map gets no pool and 0 is returned. Returns -ENOMEM if gen_pool_create()
+ * fails.
+ */
+s32 scx_arena_pool_init(struct scx_sched *sch)
+{
+ if (!sch->arena_map)
+ return 0;
+
+ sch->arena_pool = gen_pool_create(SCX_ARENA_MIN_ORDER, NUMA_NO_NODE);
+ if (!sch->arena_pool)
+ return -ENOMEM;
+ return 0;
+}
+
+/*
+ * gen_pool_for_each_chunk() callback used by scx_arena_pool_destroy(). Walk
+ * @chunk's allocation bitmap and hand every run of set bits back to @pool via
+ * gen_pool_free() so that nothing is left allocated in the chunk. One bit
+ * covers 1 << min_alloc_order bytes. @data is unused.
+ */
+static void scx_arena_clear_chunk(struct gen_pool *pool, struct gen_pool_chunk *chunk,
+ void *data)
+{
+ int order = pool->min_alloc_order;
+ /* the + 1 treats end_addr as the last byte of the chunk */
+ size_t chunk_sz = chunk->end_addr - chunk->start_addr + 1;
+ unsigned long end_bit = chunk_sz >> order;
+ unsigned long b, e;
+
+ /* [b, e) is a run of set bits; convert bit offsets back to addr/size */
+ for_each_set_bitrange(b, e, chunk->bits, end_bit)
+ gen_pool_free(pool, chunk->start_addr + (b << order),
+ (e - b) << order);
+}
+
+/*
+ * Tear down the pool. Outstanding gen_pool allocations are freed via
+ * scx_arena_clear_chunk() so gen_pool_destroy() doesn't BUG. The underlying
+ * arena pages are released when the arena map itself is torn down.
+ */
+void scx_arena_pool_destroy(struct scx_sched *sch)
+{
+ if (!sch->arena_pool)
+ return;
+ gen_pool_for_each_chunk(sch->arena_pool, scx_arena_clear_chunk, NULL);
+ gen_pool_destroy(sch->arena_pool);
+ sch->arena_pool = NULL;
+}
+
+/*
+ * Grow the pool by @page_cnt pages. bpf_arena_alloc_pages_sleepable() and
+ * gen_pool_add() (which calls vzalloc(GFP_KERNEL)) require a sleepable
+ * context.
+ */
+static int scx_arena_grow(struct scx_sched *sch, u32 page_cnt)
+{
+ u64 kern_vm_start;
+ u32 uaddr32;
+ void *p;
+ int ret;
+
+ if (!sch->arena_map || !sch->arena_pool)
+ return -EINVAL;
+
+ p = bpf_arena_alloc_pages_sleepable(sch->arena_map, NULL,
+ page_cnt, NUMA_NO_NODE, 0);
+ if (!p)
+ return -ENOMEM;
+
+ uaddr32 = (u32)(unsigned long)p;
+ /* arena.o, which defines these, is built only on MMU && 64BIT */
+#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
+ kern_vm_start = bpf_arena_map_kern_vm_start(sch->arena_map);
+#else
+ kern_vm_start = 0;
+#endif
+
+ ret = gen_pool_add(sch->arena_pool, kern_vm_start + uaddr32,
+ page_cnt * PAGE_SIZE, NUMA_NO_NODE);
+ if (ret) {
+ bpf_arena_free_pages_non_sleepable(sch->arena_map, p, page_cnt);
+ return ret;
+ }
+ return 0;
+}
+
+/*
+ * Allocate @size bytes from the arena pool. Returns kernel VA on success, NULL
+ * on failure, if @sch has no arena pool, or if @size is zero. May grow the
+ * pool via scx_arena_grow() which sleeps. Caller must be in a GFP_KERNEL
+ * context.
+ */
+void *scx_arena_alloc(struct scx_sched *sch, size_t size)
+{
+	unsigned long kern_va;
+	u32 page_cnt;
+
+	might_sleep();
+
+	/*
+	 * gen_pool_alloc() returns 0 for a zero-sized request. The retry loop
+	 * below can't tell that apart from an exhausted pool and would keep
+	 * adding pages until the arena itself ran out, so reject it up front.
+	 */
+	if (!sch->arena_pool || !size)
+		return NULL;
+
+	while (true) {
+		kern_va = gen_pool_alloc(sch->arena_pool, size);
+		if (kern_va)
+			break;
+		/* grow by at least enough pages to hold @size, then retry */
+		page_cnt = max_t(u32, SCX_ARENA_GROW_PAGES,
+				 (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
+		if (scx_arena_grow(sch, page_cnt))
+			return NULL;
+	}
+
+	return (void *)kern_va;
+}
+
+/*
+ * Return @size bytes at @kern_va to @sch's arena pool. No-op when @sch has no
+ * pool or @kern_va is NULL. The range is passed to gen_pool_free() as is, so
+ * @kern_va and @size presumably have to match the scx_arena_alloc() call that
+ * produced them - TODO confirm against callers.
+ */
+void scx_arena_free(struct scx_sched *sch, void *kern_va, size_t size)
+{
+ if (sch->arena_pool && kern_va)
+ gen_pool_free(sch->arena_pool, (unsigned long)kern_va, size);
+}
diff --git a/kernel/sched/ext_arena.h b/kernel/sched/ext_arena.h
new file mode 100644
index 000000000000..4f3610160102
--- /dev/null
+++ b/kernel/sched/ext_arena.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
+ */
+#ifndef _KERNEL_SCHED_EXT_ARENA_H
+#define _KERNEL_SCHED_EXT_ARENA_H
+
+struct scx_sched;
+
+s32 scx_arena_pool_init(struct scx_sched *sch);
+void scx_arena_pool_destroy(struct scx_sched *sch);
+void *scx_arena_alloc(struct scx_sched *sch, size_t size);
+void scx_arena_free(struct scx_sched *sch, void *kern_va, size_t size);
+
+#endif /* _KERNEL_SCHED_EXT_ARENA_H */
diff --git a/kernel/sched/ext_cid.c b/kernel/sched/ext_cid.c
new file mode 100644
index 000000000000..66944a7ef79d
--- /dev/null
+++ b/kernel/sched/ext_cid.c
@@ -0,0 +1,707 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+#include <linux/cacheinfo.h>
+
+/*
+ * cid tables.
+ *
+ * Pointers are published once on first enable and never revoked. The default
+ * mapping is populated before ops.init() runs; scx_bpf_cid_override() commits
+ * before it returns. As long as the BPF scheduler only uses the tables from
+ * those points onward, it sees a consistent view.
+ */
+s16 *scx_cid_to_cpu_tbl;
+s16 *scx_cpu_to_cid_tbl;
+struct scx_cid_topo *scx_cid_topo;
+
+#define SCX_CID_TOPO_NEG (struct scx_cid_topo) { \
+ .core_cid = -1, .core_idx = -1, .llc_cid = -1, .llc_idx = -1, \
+ .node_cid = -1, .node_idx = -1, \
+}
+
+/*
+ * Return @cpu's LLC shared_cpu_map. If cacheinfo isn't populated (offline or
+ * !present), record @cpu in @fallbacks and return its node mask instead - the
+ * worst that can happen is that the cpu's LLC becomes coarser than reality.
+ */
+static const struct cpumask *cpu_llc_mask(int cpu, struct cpumask *fallbacks)
+{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
+
+ if (!ci || !ci->info_list || !ci->num_leaves) {
+ cpumask_set_cpu(cpu, fallbacks);
+ return cpumask_of_node(cpu_to_node(cpu));
+ }
+ return &ci->info_list[ci->num_leaves - 1].shared_cpu_map;
+}
+
+/* Allocate the cid tables once on first enable; never freed. */
+static s32 scx_cid_arrays_alloc(void)
+{
+ u32 npossible = num_possible_cpus();
+ s16 *cid_to_cpu, *cpu_to_cid;
+ struct scx_cid_topo *cid_topo;
+
+ if (scx_cid_to_cpu_tbl)
+ return 0;
+
+ cid_to_cpu = kzalloc_objs(*scx_cid_to_cpu_tbl, npossible, GFP_KERNEL);
+ cpu_to_cid = kzalloc_objs(*scx_cpu_to_cid_tbl, nr_cpu_ids, GFP_KERNEL);
+ cid_topo = kmalloc_objs(*scx_cid_topo, npossible, GFP_KERNEL);
+
+ if (!cid_to_cpu || !cpu_to_cid || !cid_topo) {
+ kfree(cid_to_cpu);
+ kfree(cpu_to_cid);
+ kfree(cid_topo);
+ return -ENOMEM;
+ }
+
+ WRITE_ONCE(scx_cid_to_cpu_tbl, cid_to_cpu);
+ WRITE_ONCE(scx_cpu_to_cid_tbl, cpu_to_cid);
+ WRITE_ONCE(scx_cid_topo, cid_topo);
+ return 0;
+}
+
+/**
+ * scx_cid_init - build the cid mapping
+ * @sch: the scx_sched being initialized; used as the scx_error() target
+ *
+ * See "Topological CPU IDs" in ext_cid.h for the model. Walk online cpus by
+ * intersection at each level (parent_scratch & this_level_mask), which keeps
+ * containment correct by construction and naturally splits a physical LLC
+ * straddling two NUMA nodes into two LLC units. The caller must hold
+ * cpus_read_lock.
+ */
+s32 scx_cid_init(struct scx_sched *sch)
+{
+ cpumask_var_t to_walk __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+ cpumask_var_t node_scratch __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+ cpumask_var_t llc_scratch __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+ cpumask_var_t core_scratch __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+ cpumask_var_t llc_fallback __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+ cpumask_var_t online_no_topo __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+ u32 next_cid = 0;
+ s32 next_node_idx = 0, next_llc_idx = 0, next_core_idx = 0;
+ s32 cpu, ret;
+
+ /* CMASK_MAX_WORDS in cid.bpf.h covers NR_CPUS up to 8192 */
+ BUILD_BUG_ON(NR_CPUS > 8192);
+
+ lockdep_assert_cpus_held();
+
+ ret = scx_cid_arrays_alloc();
+ if (ret)
+ return ret;
+
+ if (!zalloc_cpumask_var(&to_walk, GFP_KERNEL) ||
+ !zalloc_cpumask_var(&node_scratch, GFP_KERNEL) ||
+ !zalloc_cpumask_var(&llc_scratch, GFP_KERNEL) ||
+ !zalloc_cpumask_var(&core_scratch, GFP_KERNEL) ||
+ !zalloc_cpumask_var(&llc_fallback, GFP_KERNEL) ||
+ !zalloc_cpumask_var(&online_no_topo, GFP_KERNEL))
+ return -ENOMEM;
+
+ /* -1 sentinels for sparse-possible cpu id holes (0 is a valid cid) */
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++)
+ scx_cpu_to_cid_tbl[cpu] = -1;
+
+ cpumask_copy(to_walk, cpu_online_mask);
+
+ while (!cpumask_empty(to_walk)) {
+ s32 next_cpu = cpumask_first(to_walk);
+ s32 nid = cpu_to_node(next_cpu);
+ s32 node_cid = next_cid;
+ s32 node_idx;
+
+ /*
+ * No NUMA info: skip and let the tail loop assign a no-topo
+ * cid. cpumask_of_node(-1) is undefined.
+ */
+ if (nid < 0) {
+ cpumask_clear_cpu(next_cpu, to_walk);
+ continue;
+ }
+
+ node_idx = next_node_idx++;
+
+ /* node_scratch = to_walk & this node */
+ cpumask_and(node_scratch, to_walk, cpumask_of_node(nid));
+ if (WARN_ON_ONCE(!cpumask_test_cpu(next_cpu, node_scratch)))
+ return -EINVAL;
+
+ while (!cpumask_empty(node_scratch)) {
+ s32 ncpu = cpumask_first(node_scratch);
+ const struct cpumask *llc_mask = cpu_llc_mask(ncpu, llc_fallback);
+ s32 llc_cid = next_cid;
+ s32 llc_idx = next_llc_idx++;
+
+ /* llc_scratch = node_scratch & this llc */
+ cpumask_and(llc_scratch, node_scratch, llc_mask);
+ if (WARN_ON_ONCE(!cpumask_test_cpu(ncpu, llc_scratch)))
+ return -EINVAL;
+
+ while (!cpumask_empty(llc_scratch)) {
+ s32 lcpu = cpumask_first(llc_scratch);
+ const struct cpumask *sib = topology_sibling_cpumask(lcpu);
+ s32 core_cid = next_cid;
+ s32 core_idx = next_core_idx++;
+ s32 ccpu;
+
+ /* core_scratch = llc_scratch & this core */
+ cpumask_and(core_scratch, llc_scratch, sib);
+ if (WARN_ON_ONCE(!cpumask_test_cpu(lcpu, core_scratch)))
+ return -EINVAL;
+
+ for_each_cpu(ccpu, core_scratch) {
+ s32 cid = next_cid++;
+
+ scx_cid_to_cpu_tbl[cid] = ccpu;
+ scx_cpu_to_cid_tbl[ccpu] = cid;
+ scx_cid_topo[cid] = (struct scx_cid_topo){
+ .core_cid = core_cid,
+ .core_idx = core_idx,
+ .llc_cid = llc_cid,
+ .llc_idx = llc_idx,
+ .node_cid = node_cid,
+ .node_idx = node_idx,
+ };
+
+ cpumask_clear_cpu(ccpu, llc_scratch);
+ cpumask_clear_cpu(ccpu, node_scratch);
+ cpumask_clear_cpu(ccpu, to_walk);
+ }
+ }
+ }
+ }
+
+ /*
+ * No-topo section: any possible cpu without a cid - normally just the
+ * not-online ones. Collect any currently-online cpus that land here in
+ * @online_no_topo so we can warn about them at the end.
+ */
+ for_each_cpu(cpu, cpu_possible_mask) {
+ s32 cid;
+
+ if (__scx_cpu_to_cid(cpu) != -1)
+ continue;
+ if (cpu_online(cpu))
+ cpumask_set_cpu(cpu, online_no_topo);
+
+ cid = next_cid++;
+ scx_cid_to_cpu_tbl[cid] = cpu;
+ scx_cpu_to_cid_tbl[cpu] = cid;
+ scx_cid_topo[cid] = SCX_CID_TOPO_NEG;
+ }
+
+ if (!cpumask_empty(llc_fallback))
+ pr_warn("scx_cid: cpus without cacheinfo, using node mask as llc: %*pbl\n",
+ cpumask_pr_args(llc_fallback));
+ if (!cpumask_empty(online_no_topo))
+ pr_warn("scx_cid: online cpus with no usable topology: %*pbl\n",
+ cpumask_pr_args(online_no_topo));
+
+ return 0;
+}
+
+/**
+ * scx_cmask_clear - Zero every bit in @m's active range
+ * @m: cmask to clear
+ *
+ * Storage past the active range is left as is.
+ */
+void scx_cmask_clear(struct scx_cmask *m)
+{
+ u32 nr_words;
+
+ if (!m->nr_cids)
+ return;
+ nr_words = (m->base + m->nr_cids - 1) / 64 - m->base / 64 + 1;
+ memset(m->bits, 0, nr_words * sizeof(u64));
+}
+
+/**
+ * scx_cmask_fill - Set every bit in @m's active range
+ * @m: cmask to fill
+ *
+ * Counterpart to scx_cmask_clear(). Storage past the active range is left as is.
+ */
+void scx_cmask_fill(struct scx_cmask *m)
+{
+ u32 nr_words, head_bits, tail_bits;
+
+ if (!m->nr_cids)
+ return;
+ nr_words = (m->base + m->nr_cids - 1) / 64 - m->base / 64 + 1;
+ memset(m->bits, 0xff, nr_words * sizeof(u64));
+
+ /* clear word-0 bits below base */
+ head_bits = m->base & 63;
+ if (head_bits)
+ m->bits[0] &= ~((1ULL << head_bits) - 1);
+
+ /* clear last-word bits at or past base + nr_cids */
+ tail_bits = (m->base + m->nr_cids) & 63;
+ if (tail_bits)
+ m->bits[nr_words - 1] &= (1ULL << tail_bits) - 1;
+}
+
+/**
+ * scx_cpumask_to_cmask - Translate a kernel cpumask into a cmask
+ * @src: source cpumask
+ * @dst: cmask to write
+ *
+ * Clear @dst's active range and set the bit for each cid whose cpu is in
+ * @src and lies within that range. Out-of-range cids are silently ignored.
+ */
+void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst)
+{
+ s32 cpu;
+
+ scx_cmask_clear(dst);
+ for_each_cpu(cpu, src) {
+ s32 cid = __scx_cpu_to_cid(cpu);
+
+ if (cid >= 0)
+ __scx_cmask_set(cid, dst);
+ }
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_cid_override - Install an explicit cpu->cid mapping
+ * @cpu_to_cid: array of nr_cpu_ids s32 entries (cid for each cpu)
+ * @cpu_to_cid__sz: must be nr_cpu_ids * sizeof(s32) bytes
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * May only be called from ops.init() of the root scheduler. Replace the
+ * topology-probed cid mapping with the caller-provided one. Each possible cpu
+ * must map to a unique cid in [0, num_possible_cpus()). Topo info is cleared.
+ * On invalid input, trigger scx_error() to abort the scheduler.
+ */
+__bpf_kfunc void scx_bpf_cid_override(const s32 *cpu_to_cid, u32 cpu_to_cid__sz,
+ const struct bpf_prog_aux *aux)
+{
+ cpumask_var_t seen __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+ struct scx_sched *sch;
+ bool alloced;
+ s32 cpu, cid;
+
+ /* GFP_KERNEL alloc must happen before the rcu read section */
+ alloced = zalloc_cpumask_var(&seen, GFP_KERNEL);
+
+ guard(rcu)();
+
+ sch = scx_prog_sched(aux);
+ if (unlikely(!sch))
+ return;
+
+ if (!alloced) {
+ scx_error(sch, "scx_bpf_cid_override: failed to allocate cpumask");
+ return;
+ }
+
+ if (scx_parent(sch)) {
+ scx_error(sch, "scx_bpf_cid_override() only allowed from root sched");
+ return;
+ }
+
+ if (cpu_to_cid__sz != nr_cpu_ids * sizeof(s32)) {
+ scx_error(sch, "scx_bpf_cid_override: expected %zu bytes, got %u",
+ nr_cpu_ids * sizeof(s32), cpu_to_cid__sz);
+ return;
+ }
+
+ for_each_possible_cpu(cpu) {
+ s32 c = cpu_to_cid[cpu];
+
+ if (!cid_valid(sch, c))
+ return;
+ if (cpumask_test_and_set_cpu(c, seen)) {
+ scx_error(sch, "cid %d assigned to multiple cpus", c);
+ return;
+ }
+ scx_cpu_to_cid_tbl[cpu] = c;
+ scx_cid_to_cpu_tbl[c] = cpu;
+ }
+
+ /* Invalidate stale topo info - the override carries no topology. */
+ for (cid = 0; cid < num_possible_cpus(); cid++)
+ scx_cid_topo[cid] = SCX_CID_TOPO_NEG;
+}
+
+/**
+ * scx_bpf_cid_to_cpu - Return the raw CPU id for @cid
+ * @cid: cid to look up
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * Return the raw CPU id for @cid. Trigger scx_error() and return -EINVAL if
+ * @cid is invalid. The cid<->cpu mapping is static for the lifetime of the
+ * loaded scheduler, so the BPF side can cache the result to avoid repeated
+ * kfunc invocations.
+ */
+__bpf_kfunc s32 scx_bpf_cid_to_cpu(s32 cid, const struct bpf_prog_aux *aux)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = scx_prog_sched(aux);
+ if (unlikely(!sch))
+ return -EINVAL;
+ return scx_cid_to_cpu(sch, cid);
+}
+
+/**
+ * scx_bpf_cpu_to_cid - Return the cid for @cpu
+ * @cpu: cpu to look up
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * Return the cid for @cpu. Trigger scx_error() and return -EINVAL if @cpu is
+ * invalid. The cid<->cpu mapping is static for the lifetime of the loaded
+ * scheduler, so the BPF side can cache the result to avoid repeated kfunc
+ * invocations.
+ */
+__bpf_kfunc s32 scx_bpf_cpu_to_cid(s32 cpu, const struct bpf_prog_aux *aux)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = scx_prog_sched(aux);
+ if (unlikely(!sch))
+ return -EINVAL;
+ return scx_cpu_to_cid(sch, cpu);
+}
+
+/*
+ * Set ops on cmasks. cmask_walk_op2() shares one walk across mutating
+ * (and/or/copy/andnot) and predicate (subset/intersects) two-cmask forms;
+ * cmask_walk_op1() does the same shape over a single cmask range. Every public
+ * entry passes a compile-time-constant @op; cmask_walk_op{1,2}() and
+ * cmask_word_op{1,2}() are __always_inline so the inner switch collapses to the
+ * selected op and cmask_op2_is_pred() folds the predicate early-exit out of
+ * mutating ops.
+ *
+ * Two-cmask ops only touch @dst bits inside the intersection of the two ranges;
+ * bits outside stay untouched. In particular, scx_cmask_copy() does NOT zero
+ * @dst bits that lie outside @src's range.
+ *
+ * The _RACY variants are otherwise identical to their non-racy counterpart but
+ * read @src word-by-word via data_race(). Memory ordering with concurrent
+ * writers is the caller's responsibility.
+ */
+enum cmask_op2 {
+ /* mutating */
+ CMASK_OP2_AND,
+ CMASK_OP2_OR,
+ CMASK_OP2_OR_RACY,
+ CMASK_OP2_COPY,
+ CMASK_OP2_COPY_RACY,
+ CMASK_OP2_ANDNOT,
+ /* predicates - short-circuit when the per-word result is true */
+ CMASK_OP2_SUBSET,
+ CMASK_OP2_INTERSECTS,
+};
+
+static __always_inline bool cmask_op2_is_pred(const enum cmask_op2 op)
+{
+ return op == CMASK_OP2_SUBSET || op == CMASK_OP2_INTERSECTS;
+}
+
+static __always_inline bool cmask_word_op2(u64 *av, const u64 *bp, u64 mask,
+ const enum cmask_op2 op)
+{
+ switch (op) {
+ case CMASK_OP2_AND:
+ *av &= ~mask | *bp;
+ return false;
+ case CMASK_OP2_OR:
+ *av |= *bp & mask;
+ return false;
+ case CMASK_OP2_OR_RACY:
+ *av |= data_race(*bp) & mask;
+ return false;
+ case CMASK_OP2_COPY:
+ *av = (*av & ~mask) | (*bp & mask);
+ return false;
+ case CMASK_OP2_COPY_RACY:
+ *av = (*av & ~mask) | (data_race(*bp) & mask);
+ return false;
+ case CMASK_OP2_ANDNOT:
+ *av &= ~(*bp & mask);
+ return false;
+ case CMASK_OP2_SUBSET:
+ /* stop on the first bit in @sub not set in @super */
+ return (*bp & ~*av) & mask;
+ case CMASK_OP2_INTERSECTS:
+ return (*av & *bp) & mask;
+ }
+ unreachable();
+}
+
+/*
+ * Walk the intersection of [@a_base, @a_base + @a_nr_cids) with [@b_base,
+ * @b_base + @b_nr_cids) word by word, applying @op. Mutating ops walk all words
+ * and return false; predicates return true on the first word whose per-word
+ * test is true. Empty intersection returns false (matches "no bits to consider"
+ * for both mutate and predicate).
+ *
+ * Base/nr_cids are taken as parameters so callers with snapshotted bounds can
+ * drive the walk with values independent of the cmask's header.
+ *
+ * @a_bits and @b_bits are indexed relative to the word holding their own base:
+ * absolute word index w maps to a_bits[w - a_base / 64] and
+ * b_bits[w - b_base / 64]. Only mutating ops write through @a_bits.
+ */
+static __always_inline bool cmask_walk_op2(u64 *a_bits, u32 a_base, u32 a_nr_cids,
+ const u64 *b_bits, u32 b_base, u32 b_nr_cids,
+ const enum cmask_op2 op)
+{
+ u32 lo = max(a_base, b_base);
+ u32 hi = min(a_base + a_nr_cids, b_base + b_nr_cids);
+ u32 a_word_off = a_base / 64;
+ u32 b_word_off = b_base / 64;
+ u32 lo_word = lo / 64;
+ /*
+ * hi_word and tail_mask are derived from hi - 1 before the emptiness
+ * check below; they are only consumed once lo < hi has been established.
+ */
+ u32 hi_word = (hi - 1) / 64;
+ u64 head_mask = GENMASK_U64(63, lo & 63);
+ u64 tail_mask = GENMASK_U64((hi - 1) & 63, 0);
+ u32 w;
+
+ if (lo >= hi)
+ return false;
+
+ /* intersection fits in one word: apply head and tail limits together */
+ if (lo_word == hi_word)
+ return cmask_word_op2(&a_bits[lo_word - a_word_off],
+ &b_bits[lo_word - b_word_off],
+ head_mask & tail_mask, op);
+
+ /* first word: bits below lo are masked off */
+ if (cmask_word_op2(&a_bits[lo_word - a_word_off],
+ &b_bits[lo_word - b_word_off], head_mask, op) &&
+ cmask_op2_is_pred(op))
+ return true;
+
+ /* interior words are fully inside the intersection */
+ for (w = lo_word + 1; w < hi_word; w++)
+ if (cmask_word_op2(&a_bits[w - a_word_off],
+ &b_bits[w - b_word_off], ~0ULL, op) &&
+ cmask_op2_is_pred(op))
+ return true;
+
+ /*
+ * last word: bits at and above hi are masked off. The per-word result is
+ * returned as is - false for mutating ops, the test result for predicates.
+ */
+ return cmask_word_op2(&a_bits[hi_word - a_word_off],
+ &b_bits[hi_word - b_word_off], tail_mask, op);
+}
+
+enum cmask_op1 {
+ CMASK_OP1_ANY_SET, /* true if any masked bit of the word is set */
+};
+
+/*
+ * Apply the single-cmask @op to the word at @ap, considering only the bits
+ * selected by @mask. Returns the per-word result of the op.
+ */
+static __always_inline bool cmask_word_op1(const u64 *ap, u64 mask,
+					   const enum cmask_op1 op)
+{
+	switch (op) {
+	case CMASK_OP1_ANY_SET:
+		return (mask & *ap) != 0;
+	}
+	unreachable();
+}
+
+/*
+ * Apply @op to every word covering [@a_base, @a_base + @a_nr_cids). @a_bits[0]
+ * is the word that holds bit @a_base. The walk stops and returns true at the
+ * first word whose per-word test is true; it returns false when no word matches
+ * or when the range is empty.
+ *
+ * Every op1 that exists today short-circuits on a true per-word result. Should
+ * a non-predicate op1 be added, guard the early returns with a
+ * cmask_op1_is_pred() helper the way cmask_walk_op2() uses cmask_op2_is_pred().
+ */
+static __always_inline bool cmask_walk_op1(const u64 *a_bits, u32 a_base,
+					   u32 a_nr_cids,
+					   const enum cmask_op1 op)
+{
+	u32 end = a_base + a_nr_cids;
+	u32 last_idx, idx;
+	u64 first_mask, last_mask;
+
+	if (a_base >= end)
+		return false;
+
+	/* word indices are relative to the word that holds @a_base */
+	last_idx = (end - 1) / 64 - a_base / 64;
+	first_mask = GENMASK_U64(63, a_base & 63);
+	last_mask = GENMASK_U64((end - 1) & 63, 0);
+
+	/* whole range lives in one word */
+	if (!last_idx)
+		return cmask_word_op1(&a_bits[0], first_mask & last_mask, op);
+
+	if (cmask_word_op1(&a_bits[0], first_mask, op))
+		return true;
+
+	for (idx = 1; idx < last_idx; idx++) {
+		if (cmask_word_op1(&a_bits[idx], ~0ULL, op))
+			return true;
+	}
+
+	return cmask_word_op1(&a_bits[last_idx], last_mask, op);
+}
+
+/**
+ * scx_cmask_and - AND @src into @dst
+ * @dst: cmask to update
+ * @src: cmask to AND with
+ *
+ * Only @dst bits inside the intersection of the two ranges are touched; a bit
+ * there stays set only if it is also set in @src.
+ */
+void scx_cmask_and(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+		       src->bits, src->base, src->nr_cids,
+		       CMASK_OP2_AND);
+}
+
+/**
+ * scx_cmask_or - OR @src into @dst
+ * @dst: cmask to update
+ * @src: cmask whose set bits are added
+ *
+ * Only @dst bits inside the intersection of the two ranges are touched.
+ */
+void scx_cmask_or(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+		       src->bits, src->base, src->nr_cids,
+		       CMASK_OP2_OR);
+}
+
+/**
+ * scx_cmask_or_racy - OR @src into @dst, reading @src without locking
+ * @dst: cmask to update
+ * @src: cmask whose set bits are added; may be written concurrently
+ *
+ * Each @src word is read through data_race(). Bits in a cmask are independent
+ * of one another, so racing with a writer only means some bits are fresher
+ * than others. Memory ordering with writers is the caller's responsibility.
+ */
+void scx_cmask_or_racy(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+		       src->bits, src->base, src->nr_cids,
+		       CMASK_OP2_OR_RACY);
+}
+
+/**
+ * scx_cmask_copy - Copy @src's bits into @dst
+ * @dst: cmask to update
+ * @src: cmask to copy from
+ *
+ * Only @dst bits inside the intersection of the two ranges are overwritten.
+ * @dst bits outside @src's range are NOT zeroed.
+ */
+void scx_cmask_copy(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+		       src->bits, src->base, src->nr_cids,
+		       CMASK_OP2_COPY);
+}
+
+/**
+ * scx_cmask_copy_racy - Snapshot @src into @dst without locking
+ * @dst: cmask to update
+ * @src: cmask to copy from; may be written concurrently
+ *
+ * Same head/tail masking as scx_cmask_copy(), but each @src word is read
+ * through data_race(). Bits in a cmask are independent of one another, so a
+ * partial update merely leaves some bits fresher than others. Memory ordering
+ * with writers is the caller's responsibility.
+ */
+void scx_cmask_copy_racy(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+		       src->bits, src->base, src->nr_cids,
+		       CMASK_OP2_COPY_RACY);
+}
+
+/**
+ * scx_cmask_andnot - Clear from @dst the bits that are set in @src
+ * @dst: cmask to update
+ * @src: cmask naming the bits to clear
+ *
+ * Only @dst bits inside the intersection of the two ranges are touched.
+ */
+void scx_cmask_andnot(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+		       src->bits, src->base, src->nr_cids,
+		       CMASK_OP2_ANDNOT);
+}
+
+/*
+ * Test whether any bit of @cm in [@lo, @hi) is set. An empty range yields
+ * false. The caller guarantees that [@lo, @hi) lies within @cm's range.
+ */
+static bool cmask_any_set_in_range(const struct scx_cmask *cm, u32 lo, u32 hi)
+{
+	const u64 *first_word;
+
+	if (hi <= lo)
+		return false;
+
+	/* hand the walker the word holding @lo, as it indexes from its base */
+	first_word = &cm->bits[lo / 64 - cm->base / 64];
+	return cmask_walk_op1(first_word, lo, hi - lo, CMASK_OP1_ANY_SET);
+}
+
+/**
+ * scx_cmask_subset - test whether @sub is a subset of @super
+ * @sub: cmask whose set bits must all be covered
+ * @super: cmask expected to cover @sub
+ *
+ * Return true iff every bit set in @sub is also set in @super.
+ */
+bool scx_cmask_subset(const struct scx_cmask *sub, const struct scx_cmask *super)
+{
+	u32 sub_hi = sub->base + sub->nr_cids;
+	u32 super_hi = super->base + super->nr_cids;
+
+	/*
+	 * The two-cmask walk only looks at the overlap of the two ranges. A
+	 * bit set in @sub below or above @super's range cannot be present in
+	 * @super, so those stretches are checked on their own first.
+	 */
+	if (sub->base < super->base) {
+		if (cmask_any_set_in_range(sub, sub->base,
+					   min(super->base, sub_hi)))
+			return false;
+	}
+	if (sub_hi > super_hi) {
+		if (cmask_any_set_in_range(sub, max(sub->base, super_hi),
+					   sub_hi))
+			return false;
+	}
+
+	/* the walk reports true when it finds a @sub bit missing from @super */
+	if (cmask_walk_op2((u64 *)super->bits, super->base, super->nr_cids,
+			   sub->bits, sub->base, sub->nr_cids,
+			   CMASK_OP2_SUBSET))
+		return false;
+
+	return true;
+}
+
+/**
+ * scx_cmask_intersects - test whether @a and @b share a set bit
+ * @a: first cmask
+ * @b: second cmask
+ *
+ * Return true iff some bit inside the intersection of the two ranges is set in
+ * both. Neither cmask is modified; the cast only satisfies the walker's
+ * signature.
+ */
+bool scx_cmask_intersects(const struct scx_cmask *a, const struct scx_cmask *b)
+{
+	return cmask_walk_op2((u64 *)a->bits, a->base, a->nr_cids,
+			      b->bits, b->base, b->nr_cids,
+			      CMASK_OP2_INTERSECTS);
+}
+
+/**
+ * scx_cmask_empty - Test whether @m has no bits set
+ * @m: cmask to test
+ *
+ * Return true iff no bit is set within @m's active range.
+ */
+bool scx_cmask_empty(const struct scx_cmask *m)
+{
+	if (cmask_any_set_in_range(m, m->base, m->base + m->nr_cids))
+		return false;
+
+	return true;
+}
+
+/**
+ * scx_bpf_cid_topo - Copy out per-cid topology info
+ * @cid: cid to look up
+ * @out__uninit: where to copy the topology info; fully written by this call
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * Write the topology info for @cid into @out__uninit. An out-of-range @cid
+ * triggers scx_error(). When no scheduler is found for @aux or @cid is
+ * invalid, @out__uninit receives SCX_CID_TOPO_NEG. If @cid is valid but in the
+ * no-topo section, all fields are set to -1.
+ */
+__bpf_kfunc void scx_bpf_cid_topo(s32 cid, struct scx_cid_topo *out__uninit,
+				  const struct bpf_prog_aux *aux)
+{
+	struct scx_sched *sch;
+
+	guard(rcu)();
+
+	sch = scx_prog_sched(aux);
+	if (likely(sch) && cid_valid(sch, cid)) {
+		*out__uninit = READ_ONCE(scx_cid_topo)[cid];
+		return;
+	}
+
+	*out__uninit = SCX_CID_TOPO_NEG;
+}
+
+__bpf_kfunc_end_defs();
+
+/*
+ * Id set holding scx_bpf_cid_override, flagged sleepable. It is wrapped by
+ * scx_kfunc_set_init below, which attaches scx_kfunc_context_filter.
+ */
+BTF_KFUNCS_START(scx_kfunc_ids_init)
+BTF_ID_FLAGS(func, scx_bpf_cid_override, KF_IMPLICIT_ARGS | KF_SLEEPABLE)
+BTF_KFUNCS_END(scx_kfunc_ids_init)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_init = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_init,
+ .filter = scx_kfunc_context_filter,
+};
+
+/*
+ * Id set holding the cid lookup kfuncs (cid <-> cpu translation and per-cid
+ * topology). scx_kfunc_set_cid below wraps it without a filter.
+ */
+BTF_KFUNCS_START(scx_kfunc_ids_cid)
+BTF_ID_FLAGS(func, scx_bpf_cid_to_cpu, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cpu_to_cid, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cid_topo, KF_IMPLICIT_ARGS)
+BTF_KFUNCS_END(scx_kfunc_ids_cid)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_cid = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_cid,
+};
+
+/*
+ * Register the cid kfunc sets: the init set for struct_ops programs, and the
+ * cid lookup set for struct_ops, tracing and syscall programs. Registration
+ * stops at the first failure. Returns 0 on success, otherwise the first
+ * non-zero value returned by register_btf_kfunc_id_set().
+ */
+int scx_cid_kfunc_init(void)
+{
+	int ret;
+
+	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_init);
+	if (ret)
+		return ret;
+
+	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_cid);
+	if (ret)
+		return ret;
+
+	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_cid);
+	if (ret)
+		return ret;
+
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_cid);
+}
diff --git a/kernel/sched/ext_cid.h b/kernel/sched/ext_cid.h
new file mode 100644
index 000000000000..5745e5785e89
--- /dev/null
+++ b/kernel/sched/ext_cid.h
@@ -0,0 +1,271 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Topological CPU IDs (cids)
+ * --------------------------
+ *
+ * Raw cpu numbers are clumsy for sharding work and communication across
+ * topology units, especially from BPF: the space can be sparse, numerical
+ * closeness doesn't imply topological closeness (x86 hyperthreading often puts
+ * SMT siblings far apart), and a range of cpu ids doesn't mean anything.
+ * Sub-scheds make this acute - cpu allocation, revocation and other state are
+ * constantly communicated across sub-scheds, and passing whole cpumasks scales
+ * poorly with cpu count. cpumasks are also awkward in BPF: a variable-length
+ * kernel type sized for the maximum NR_CPUS (4k), with verbose helper sequences
+ * for every op.
+ *
+ * cids give every cpu a dense, topology-ordered id. CPUs sharing a core, LLC or
+ * NUMA node get contiguous cid ranges, so a topology unit becomes a (start,
+ * length) slice of cid space. Communication can pass a slice instead of a
+ * cpumask, and BPF code can process, for example, a u64 word's worth of cids at
+ * a time.
+ *
+ * The mapping is built once at root scheduler enable time by walking the
+ * topology of online cpus only. Going by online cpus is out of necessity:
+ * depending on the arch, topology info isn't reliably available for offline
+ * cpus. The expected usage model is restarting the scheduler on hotplug events
+ * so the mapping is rebuilt against the new online set. A scheduler that wants
+ * to handle hotplug without a restart can provide its own cid and shard mapping
+ * through the override interface.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+#ifndef _KERNEL_SCHED_EXT_CID_H
+#define _KERNEL_SCHED_EXT_CID_H
+
+struct scx_sched;
+
+/*
+ * Cid space (total is always num_possible_cpus()) is laid out with
+ * topology-annotated cids first, then no-topo cids at the tail. The
+ * topology-annotated block covers the cpus that were online when scx_cid_init()
+ * ran and remains valid even after those cpus go offline. The tail block covers
+ * possible-but-not-online cpus and carries all-(-1) topo info (see
+ * scx_cid_topo); callers detect it via the -1 sentinels.
+ *
+ * See the comment above the table definitions in ext_cid.c for the
+ * memory-ordering and visibility contract.
+ */
+extern s16 *scx_cid_to_cpu_tbl;
+extern s16 *scx_cpu_to_cid_tbl;
+extern struct scx_cid_topo *scx_cid_topo;
+extern struct btf_id_set8 scx_kfunc_ids_init;
+
+void scx_cmask_clear(struct scx_cmask *m);
+void scx_cmask_fill(struct scx_cmask *m);
+void scx_cmask_and(struct scx_cmask *dst, const struct scx_cmask *src);
+void scx_cmask_or(struct scx_cmask *dst, const struct scx_cmask *src);
+void scx_cmask_or_racy(struct scx_cmask *dst, const struct scx_cmask *src);
+void scx_cmask_copy(struct scx_cmask *dst, const struct scx_cmask *src);
+void scx_cmask_copy_racy(struct scx_cmask *dst, const struct scx_cmask *src);
+void scx_cmask_andnot(struct scx_cmask *dst, const struct scx_cmask *src);
+bool scx_cmask_subset(const struct scx_cmask *sub, const struct scx_cmask *super);
+bool scx_cmask_intersects(const struct scx_cmask *a, const struct scx_cmask *b);
+bool scx_cmask_empty(const struct scx_cmask *m);
+s32 scx_cid_init(struct scx_sched *sch);
+int scx_cid_kfunc_init(void);
+void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst);
+
+/**
+ * cid_valid - Verify a cid value, to be used on ops input args
+ * @sch: scx_sched to abort on error
+ * @cid: cid which came from a BPF ops
+ *
+ * Return true when @cid lies in [0, num_possible_cpus()). Otherwise report the
+ * bad value through scx_error() on @sch and return false.
+ */
+static inline bool cid_valid(struct scx_sched *sch, s32 cid)
+{
+	if (unlikely(cid < 0 || cid >= num_possible_cpus())) {
+		scx_error(sch, "invalid cid %d", cid);
+		return false;
+	}
+
+	return true;
+}
+
+/**
+ * __scx_cid_to_cpu - Unchecked cid->cpu table lookup
+ * @cid: cid to look up. Must be in [0, num_possible_cpus()).
+ *
+ * For callsites that have already validated @cid and hold a non-NULL @sch from
+ * scx_prog_sched(). A live sched implies the table has been allocated, which
+ * is why the table pointer is not NULL-checked here.
+ */
+static inline s32 __scx_cid_to_cpu(s32 cid)
+{
+	/* READ_ONCE pairs with WRITE_ONCE in scx_cid_arrays_alloc() */
+	s16 *tbl = READ_ONCE(scx_cid_to_cpu_tbl);
+
+	return tbl[cid];
+}
+
+/**
+ * __scx_cpu_to_cid - Unchecked cpu->cid table lookup
+ * @cpu: cpu to look up. Must be a valid possible cpu id.
+ *
+ * The usage constraints of __scx_cid_to_cpu() apply here as well.
+ */
+static inline s32 __scx_cpu_to_cid(s32 cpu)
+{
+	s16 *tbl = READ_ONCE(scx_cpu_to_cid_tbl);
+
+	return tbl[cpu];
+}
+
+/**
+ * scx_cid_to_cpu - Translate @cid to its cpu
+ * @sch: scx_sched for error reporting
+ * @cid: cid to look up
+ *
+ * Return the cpu mapped to @cid, or -EINVAL when @cid is invalid, in which case
+ * scx_error() is also triggered on @sch. The cid arrays are allocated on first
+ * scheduler enable and never freed, so the returned cpu is stable for the
+ * lifetime of the loaded scheduler.
+ */
+static inline s32 scx_cid_to_cpu(struct scx_sched *sch, s32 cid)
+{
+	return cid_valid(sch, cid) ? __scx_cid_to_cpu(cid) : -EINVAL;
+}
+
+/**
+ * scx_cpu_to_cid - Translate @cpu to its cid
+ * @sch: scx_sched for error reporting
+ * @cpu: cpu to look up
+ *
+ * Return the cid mapped to @cpu, or -EINVAL when @cpu is invalid, in which case
+ * scx_error() is also triggered on @sch. Same lifetime guarantee as
+ * scx_cid_to_cpu().
+ */
+static inline s32 scx_cpu_to_cid(struct scx_sched *sch, s32 cpu)
+{
+	return scx_cpu_valid(sch, cpu, NULL) ? __scx_cpu_to_cid(cpu) : -EINVAL;
+}
+
+/**
+ * scx_is_cid_type - Test whether the active scheduler hierarchy is cid-form
+ *
+ * Return the state of the __scx_is_cid_type static key.
+ */
+static inline bool scx_is_cid_type(void)
+{
+	if (static_branch_unlikely(&__scx_is_cid_type))
+		return true;
+
+	return false;
+}
+
+/* Return true if @cid falls inside @m's active range [base, base + nr_cids). */
+static inline bool __scx_cmask_contains(u32 cid, const struct scx_cmask *m)
+{
+	if (likely(cid >= m->base && cid < m->base + m->nr_cids))
+		return true;
+
+	return false;
+}
+
+/*
+ * Return the word of bits[] that covers @cid. bits[0] is the word holding
+ * @m->base, hence the rebasing. @cid must satisfy __scx_cmask_contains().
+ */
+static inline u64 *__scx_cmask_word(u32 cid, const struct scx_cmask *m)
+{
+	u32 word_idx = cid / 64 - m->base / 64;
+
+	return (u64 *)&m->bits[word_idx];
+}
+
+/**
+ * __scx_cmask_init - Initialize @m with explicit storage capacity
+ * @m: cmask to initialize
+ * @base: first cid of the active range
+ * @nr_cids: number of cids in the active range
+ * @alloc_cids: storage capacity in cids, at least @nr_cids
+ *
+ * For storage sized larger than the initial active range. If @alloc_cids is
+ * smaller than @nr_cids, a one-time warning fires and the active range is
+ * clamped to @alloc_cids. All of bits[] is zeroed.
+ */
+static inline void __scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids,
+				    u32 alloc_cids)
+{
+	size_t nr_bytes;
+
+	if (WARN_ON_ONCE(alloc_cids < nr_cids))
+		nr_cids = alloc_cids;
+
+	m->alloc_words = SCX_CMASK_NR_WORDS(alloc_cids);
+	nr_bytes = m->alloc_words * sizeof(u64);
+	memset(m->bits, 0, nr_bytes);
+
+	m->nr_cids = nr_cids;
+	m->base = base;
+}
+
+/**
+ * scx_cmask_init - Initialize @m on tight storage
+ * @m: cmask to initialize
+ * @base: first cid of the active range
+ * @nr_cids: number of cids in the active range
+ *
+ * The storage capacity is taken to be exactly @nr_cids. All of bits[] is
+ * zeroed.
+ */
+static inline void scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids)
+{
+	const u32 alloc_cids = nr_cids;
+
+	__scx_cmask_init(m, base, nr_cids, alloc_cids);
+}
+
+/**
+ * scx_cmask_reframe - Reshape @m's active range without resizing storage
+ * @m: cmask to reframe
+ * @base: new active range base
+ * @nr_cids: new active range length, must fit within @m->alloc_words
+ *
+ * If the new range does not fit, a one-time warning fires and @m is left
+ * unchanged. Otherwise only the first and last words of the new range are
+ * zeroed, which keeps the padding invariant; any words in between keep their
+ * old contents, so body bits within the new range are garbage.
+ */
+static inline void scx_cmask_reframe(struct scx_cmask *m, u32 base, u32 nr_cids)
+{
+	u32 tail_idx;
+
+	if (WARN_ON_ONCE(SCX_CMASK_NR_WORDS(nr_cids) > m->alloc_words))
+		return;
+
+	if (nr_cids) {
+		tail_idx = (nr_cids - 1 + (base & 63)) / 64;
+		m->bits[tail_idx] = 0;
+		m->bits[0] = 0;
+	}
+
+	m->base = base;
+	m->nr_cids = nr_cids;
+}
+
+/*
+ * Set @cid's bit in @m with a plain read-modify-write. A @cid outside @m's
+ * active range is silently ignored.
+ */
+static inline void __scx_cmask_set(u32 cid, struct scx_cmask *m)
+{
+	if (__scx_cmask_contains(cid, m)) {
+		u64 *word = __scx_cmask_word(cid, m);
+
+		*word |= BIT_U64(cid & 63);
+	}
+}
+
+/**
+ * scx_cmask_test - test whether @cid is set in @m
+ * @cid: cid to test
+ * @m: cmask to test
+ *
+ * Return %false when @cid lies outside @m's active range, otherwise the value
+ * of @cid's bit. The word is read via READ_ONCE so callers can race set/clear
+ * writers.
+ */
+static inline bool scx_cmask_test(u32 cid, const struct scx_cmask *m)
+{
+	u64 word;
+
+	if (!__scx_cmask_contains(cid, m))
+		return false;
+
+	word = READ_ONCE(*__scx_cmask_word(cid, m));
+	return (word & BIT_U64(cid & 63)) != 0;
+}
+
+/*
+ * Number of bits[] words spanned by the active range, or 0 when the range is
+ * empty. This is tighter than the storage size from SCX_CMASK_NR_WORDS(),
+ * which accounts for the worst-case base alignment.
+ */
+static inline u32 scx_cmask_nr_used_words(const struct scx_cmask *m)
+{
+	return m->nr_cids ? ((m->base & 63) + m->nr_cids - 1) / 64 + 1 : 0;
+}
+
+/**
+ * scx_cmask_for_each_cid - iterate set cids in @m
+ * @cid: s32 loop var that receives each set cid in turn
+ * @m: cmask to iterate
+ *
+ * Visits set bits within @m's active range in ascending order. Scans only the
+ * words the active range spans, where head and tail padding is kept zero, so
+ * no per-cid range check is needed.
+ *
+ * Each word is sampled once through READ_ONCE() and its set bits are then
+ * consumed lowest first from that private copy. @m is evaluated more than
+ * once, so it must be free of side effects. The expansion is two nested for
+ * loops declaring __bs, __wi, __nw and __w; a break in the loop body only
+ * leaves the inner per-word loop, after which iteration resumes with the next
+ * word.
+ */
+#define scx_cmask_for_each_cid(cid, m) \
+ for (u64 __bs = (m)->base & ~63u, __wi = 0, \
+ __nw = scx_cmask_nr_used_words(m); \
+ __wi < __nw; __wi++) \
+ for (u64 __w = READ_ONCE((m)->bits[__wi]); \
+ __w && ((cid) = __bs + __wi * 64 + __ffs64(__w), true); \
+ __w &= __w - 1)
+
+#endif /* _KERNEL_SCHED_EXT_CID_H */
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 7468560a6d80..2077373d8da3 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -9,7 +9,6 @@
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
* Copyright (c) 2024 Andrea Righi <arighi@nvidia.com>
*/
-#include "ext_idle.h"
/* Enable/disable built-in idle CPU selection policy */
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
@@ -79,7 +78,6 @@ static bool scx_idle_test_and_clear_cpu(int cpu)
int node = scx_cpu_node_if_enabled(cpu);
struct cpumask *idle_cpus = idle_cpumask(node)->cpu;
-#ifdef CONFIG_SCHED_SMT
/*
* SMT mask should be cleared whether we can claim @cpu or not. The SMT
* cluster is not wholly idle either way. This also prevents
@@ -104,7 +102,6 @@ static bool scx_idle_test_and_clear_cpu(int cpu)
else if (cpumask_test_cpu(cpu, idle_smts))
__cpumask_clear_cpu(cpu, idle_smts);
}
-#endif
return cpumask_test_and_clear_cpu(cpu, idle_cpus);
}
@@ -466,12 +463,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
preempt_disable();
/*
- * Check whether @prev_cpu is still within the allowed set. If not,
- * we can still try selecting a nearby CPU.
- */
- is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed);
-
- /*
* Determine the subset of CPUs usable by @p within @cpus_allowed.
*/
if (allowed != p->cpus_ptr) {
@@ -488,6 +479,12 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
}
/*
+ * Check whether @prev_cpu is still within the allowed set. If not,
+ * we can still try selecting a nearby CPU.
+ */
+ is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed);
+
+ /*
* This is necessary to protect llc_cpus.
*/
rcu_read_lock();
@@ -622,7 +619,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
goto out_unlock;
}
-#ifdef CONFIG_SCHED_SMT
/*
* Use @prev_cpu's sibling if it's idle.
*/
@@ -634,7 +630,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
goto out_unlock;
}
}
-#endif
/*
* Search for any idle CPU in the same LLC domain.
@@ -714,7 +709,6 @@ static void update_builtin_idle(int cpu, bool idle)
assign_cpu(cpu, idle_cpus, idle);
-#ifdef CONFIG_SCHED_SMT
if (sched_smt_active()) {
const struct cpumask *smt = cpu_smt_mask(cpu);
struct cpumask *idle_smts = idle_cpumask(node)->smt;
@@ -731,7 +725,6 @@ static void update_builtin_idle(int cpu, bool idle)
cpumask_andnot(idle_smts, idle_smts, smt);
}
}
-#endif
}
/*
@@ -789,7 +782,7 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
*/
if (SCX_HAS_OP(sch, update_idle) && do_notify &&
!scx_bypassing(sch, cpu_of(rq)))
- SCX_CALL_OP(sch, update_idle, rq, cpu_of(rq), idle);
+ SCX_CALL_OP(sch, update_idle, rq, scx_cpu_arg(cpu_of(rq)), idle);
}
static void reset_idle_masks(struct sched_ext_ops *ops)
@@ -917,7 +910,7 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
bool we_locked = false;
s32 cpu;
- if (!ops_cpu_valid(sch, prev_cpu, NULL))
+ if (!scx_cpu_valid(sch, prev_cpu, NULL))
return -EINVAL;
if (!check_builtin_idle_enabled(sch))
@@ -990,7 +983,7 @@ __bpf_kfunc s32 scx_bpf_cpu_node(s32 cpu, const struct bpf_prog_aux *aux)
guard(rcu)();
sch = scx_prog_sched(aux);
- if (unlikely(!sch) || !ops_cpu_valid(sch, cpu, NULL))
+ if (unlikely(!sch) || !scx_cpu_valid(sch, cpu, NULL))
return NUMA_NO_NODE;
return cpu_to_node(cpu);
}
@@ -1272,7 +1265,7 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu, const struct bpf_prog_
if (!check_builtin_idle_enabled(sch))
return false;
- if (!ops_cpu_valid(sch, cpu, NULL))
+ if (!scx_cpu_valid(sch, cpu, NULL))
return false;
return scx_idle_test_and_clear_cpu(cpu);
@@ -1510,13 +1503,9 @@ static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
int scx_idle_init(void)
{
- int ret;
-
- ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ||
- register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) ||
- register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle) ||
- register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) ||
- register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_select_cpu);
-
- return ret;
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ?:
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) ?:
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle) ?:
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) ?:
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_select_cpu);
}
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index a075732d4430..b04701190b23 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -8,35 +8,6 @@
#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
#define SCX_MOFF_IDX(moff) ((moff) / sizeof(void (*)(void)))
-enum scx_consts {
- SCX_DSP_DFL_MAX_BATCH = 32,
- SCX_DSP_MAX_LOOPS = 32,
- SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
-
- SCX_EXIT_BT_LEN = 64,
- SCX_EXIT_MSG_LEN = 1024,
- SCX_EXIT_DUMP_DFL_LEN = 32768,
-
- SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE,
-
- /*
- * Iterating all tasks may take a while. Periodically drop
- * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
- */
- SCX_TASK_ITER_BATCH = 32,
-
- SCX_BYPASS_HOST_NTH = 2,
-
- SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC,
- SCX_BYPASS_LB_DONOR_PCT = 125,
- SCX_BYPASS_LB_MIN_DELTA_DIV = 4,
- SCX_BYPASS_LB_BATCH = 256,
-
- SCX_REENQ_LOCAL_MAX_REPEAT = 256,
-
- SCX_SUB_MAX_DEPTH = 4,
-};
-
enum scx_exit_kind {
SCX_EXIT_NONE,
SCX_EXIT_DONE,
@@ -94,6 +65,12 @@ struct scx_exit_info {
/* %SCX_EXIT_* - broad category of the exit reason */
enum scx_exit_kind kind;
+ /*
+ * CPU that initiated the exit, valid once @kind has been set.
+ * Negative if the exit path didn't identify a CPU.
+ */
+ s32 exit_cpu;
+
/* exit code if gracefully exiting */
s64 exit_code;
@@ -138,7 +115,8 @@ enum scx_ops_flags {
* To mask this problem, by default, unhashed tasks are automatically
* dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
* depend on pid lookups and wants to handle these tasks directly, the
- * following flag can be used.
+ * following flag can be used. With %SCX_OPS_TID_TO_TASK,
+ * scx_bpf_tid_to_task() can find exiting tasks reliably.
*/
SCX_OPS_ENQ_EXITING = 1LLU << 2,
@@ -189,6 +167,17 @@ enum scx_ops_flags {
*/
SCX_OPS_ALWAYS_ENQ_IMMED = 1LLU << 7,
+ /*
+ * Maintain a mapping from p->scx.tid to task_struct so the BPF
+ * scheduler can recover task pointers from stored tids via
+ * scx_bpf_tid_to_task().
+ *
+ * Only the root scheduler turns this on. A sub-sched may set the flag
+ * to declare a dependency on the lookup; if the root scheduler hasn't
+ * enabled it, attaching the sub-sched is rejected.
+ */
+ SCX_OPS_TID_TO_TASK = 1LLU << 8,
+
SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
SCX_OPS_ENQ_LAST |
SCX_OPS_ENQ_EXITING |
@@ -196,7 +185,8 @@ enum scx_ops_flags {
SCX_OPS_ALLOW_QUEUED_WAKEUP |
SCX_OPS_SWITCH_PARTIAL |
SCX_OPS_BUILTIN_IDLE_PER_NODE |
- SCX_OPS_ALWAYS_ENQ_IMMED,
+ SCX_OPS_ALWAYS_ENQ_IMMED |
+ SCX_OPS_TID_TO_TASK,
/* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
__SCX_OPS_INTERNAL_MASK = 0xffLLU << 56,
@@ -540,28 +530,6 @@ struct sched_ext_ops {
void (*update_idle)(s32 cpu, bool idle);
/**
- * @cpu_acquire: A CPU is becoming available to the BPF scheduler
- * @cpu: The CPU being acquired by the BPF scheduler.
- * @args: Acquire arguments, see the struct definition.
- *
- * A CPU that was previously released from the BPF scheduler is now once
- * again under its control.
- */
- void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
-
- /**
- * @cpu_release: A CPU is taken away from the BPF scheduler
- * @cpu: The CPU being released by the BPF scheduler.
- * @args: Release arguments, see the struct definition.
- *
- * The specified CPU is no longer under the control of the BPF
- * scheduler. This could be because it was preempted by a higher
- * priority sched_class, though there may be other reasons as well. The
- * caller should consult @args->reason to determine the cause.
- */
- void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
-
- /**
* @init_task: Initialize a task to run in a BPF scheduler
* @p: task to initialize for BPF scheduling
* @args: init arguments, see the struct definition
@@ -851,6 +819,128 @@ struct sched_ext_ops {
/* internal use only, must be NULL */
void __rcu *priv;
+
+ /*
+ * Deprecated callbacks. Kept at the end of the struct so the cid-form
+ * struct (sched_ext_ops_cid) can omit them without affecting the
+ * shared field offsets. Use SCX_ENQ_IMMED instead. Sitting past
+ * SCX_OPI_END means has_op doesn't cover them, so SCX_HAS_OP() cannot
+ * be used; callers must test sch->ops.cpu_acquire / cpu_release
+ * directly.
+ */
+
+ /**
+ * @cpu_acquire: A CPU is becoming available to the BPF scheduler
+ * @cpu: The CPU being acquired by the BPF scheduler.
+ * @args: Acquire arguments, see the struct definition.
+ *
+ * A CPU that was previously released from the BPF scheduler is now once
+ * again under its control. Deprecated; use SCX_ENQ_IMMED instead.
+ */
+ void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
+
+ /**
+ * @cpu_release: A CPU is taken away from the BPF scheduler
+ * @cpu: The CPU being released by the BPF scheduler.
+ * @args: Release arguments, see the struct definition.
+ *
+ * The specified CPU is no longer under the control of the BPF
+ * scheduler. This could be because it was preempted by a higher
+ * priority sched_class, though there may be other reasons as well. The
+ * caller should consult @args->reason to determine the cause.
+ * Deprecated; use SCX_ENQ_IMMED instead.
+ */
+ void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
+};
+
+/**
+ * struct sched_ext_ops_cid - cid-form alternative to struct sched_ext_ops
+ *
+ * Mirrors struct sched_ext_ops with cpu/cpumask substituted with cid/cmask
+ * where applicable. Layout up to and including @priv matches sched_ext_ops
+ * byte-for-byte (verified by BUILD_BUG_ON checks at scx_init() time) so
+ * shared field offsets work for both struct types in bpf_scx_init_member()
+ * and bpf_scx_check_member(). The deprecated cpu_acquire/cpu_release
+ * callbacks at the tail of sched_ext_ops are omitted here entirely.
+ *
+ * Differences from sched_ext_ops:
+ * - select_cpu -> select_cid (returns cid)
+ * - dispatch -> dispatch (cpu arg is now cid)
+ * - update_idle -> update_idle (cpu arg is now cid)
+ * - set_cpumask -> set_cmask (cmask instead of cpumask)
+ * - cpu_online -> cid_online
+ * - cpu_offline -> cid_offline
+ * - dump_cpu -> dump_cid
+ * - cpu_acquire/cpu_release -> not present (deprecated in sched_ext_ops)
+ *
+ * BPF schedulers using this type cannot call cpu-form scx_bpf_* kfuncs;
+ * use the cid-form variants instead. Enforced at BPF verifier time via
+ * scx_kfunc_context_filter() branching on prog->aux->st_ops.
+ *
+ * See sched_ext_ops for callback documentation.
+ */
+struct sched_ext_ops_cid {
+ s32 (*select_cid)(struct task_struct *p, s32 prev_cid, u64 wake_flags);
+ void (*enqueue)(struct task_struct *p, u64 enq_flags);
+ void (*dequeue)(struct task_struct *p, u64 deq_flags);
+ void (*dispatch)(s32 cid, struct task_struct *prev);
+ void (*tick)(struct task_struct *p);
+ void (*runnable)(struct task_struct *p, u64 enq_flags);
+ void (*running)(struct task_struct *p);
+ void (*stopping)(struct task_struct *p, bool runnable);
+ void (*quiescent)(struct task_struct *p, u64 deq_flags);
+ bool (*yield)(struct task_struct *from, struct task_struct *to);
+ bool (*core_sched_before)(struct task_struct *a,
+ struct task_struct *b);
+ void (*set_weight)(struct task_struct *p, u32 weight);
+ void (*set_cmask)(struct task_struct *p,
+ const struct scx_cmask *cmask);
+ void (*update_idle)(s32 cid, bool idle);
+ s32 (*init_task)(struct task_struct *p,
+ struct scx_init_task_args *args);
+ void (*exit_task)(struct task_struct *p,
+ struct scx_exit_task_args *args);
+ void (*enable)(struct task_struct *p);
+ void (*disable)(struct task_struct *p);
+ void (*dump)(struct scx_dump_ctx *ctx);
+ void (*dump_cid)(struct scx_dump_ctx *ctx, s32 cid, bool idle);
+ void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
+#ifdef CONFIG_EXT_GROUP_SCHED
+ s32 (*cgroup_init)(struct cgroup *cgrp,
+ struct scx_cgroup_init_args *args);
+ void (*cgroup_exit)(struct cgroup *cgrp);
+ s32 (*cgroup_prep_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+ void (*cgroup_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+ void (*cgroup_cancel_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
+ void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
+ u64 period_us, u64 quota_us, u64 burst_us);
+ void (*cgroup_set_idle)(struct cgroup *cgrp, bool idle);
+#endif /* CONFIG_EXT_GROUP_SCHED */
+ s32 (*sub_attach)(struct scx_sub_attach_args *args);
+ void (*sub_detach)(struct scx_sub_detach_args *args);
+ void (*cid_online)(s32 cid);
+ void (*cid_offline)(s32 cid);
+ s32 (*init)(void);
+ void (*exit)(struct scx_exit_info *info);
+
+ /* Data fields - must match sched_ext_ops layout exactly */
+ u32 dispatch_max_batch;
+ u64 flags;
+ u32 timeout_ms;
+ u32 exit_dump_len;
+ u64 hotplug_seq;
+ u64 sub_cgroup_id;
+ char name[SCX_OPS_NAME_LEN];
+
+ /* internal use only, must be NULL */
+ void __rcu *priv;
+
+ /* layout end anchor for the BUILD_BUG_ON in scx_init(); keep last */
+ char __end[0];
};
enum scx_opi {
@@ -1009,7 +1099,40 @@ struct scx_sched_pnode {
};
struct scx_sched {
- struct sched_ext_ops ops;
+ /*
+ * cpu-form and cid-form ops share field offsets up to .priv (verified
+ * by BUILD_BUG_ON in scx_init()). The anonymous union lets the kernel
+ * access either view of the same storage without function-pointer
+ * casts: use .ops for cpu-form and shared fields, .ops_cid for the
+ * cid-renamed callbacks (set_cmask, select_cid, cid_online, ...).
+ */
+ union {
+ struct sched_ext_ops ops;
+ struct sched_ext_ops_cid ops_cid;
+ };
+ bool is_cid_type; /* true if registered via bpf_sched_ext_ops_cid */
+
+ /*
+ * Arena map auto-discovered from member progs at struct_ops attach.
+ * cid-form schedulers must use exactly one arena across all member
+ * progs. NULL on cpu-form.
+ *
+ * @arena_pool sub-allocates @arena_map. Each gen_pool chunk is added
+ * at the kernel-side mapping address. @arena_kern_base is the start
+ * of the arena's kern_vm range. See scx_arena_to_kaddr() and
+ * scx_kaddr_to_arena().
+ */
+ struct bpf_map *arena_map;
+ struct gen_pool *arena_pool;
+ uintptr_t arena_kern_base;
+
+ /*
+ * Per-CPU arena cmask used by scx_call_op_set_cpumask() to hand a cmask
+ * to ops_cid.set_cmask(). The kernel writes through the stored kern_va
+ * and hands BPF its arena pointer via scx_kaddr_to_arena().
+ */
+ struct scx_cmask * __percpu *set_cmask_scratch;
+
DECLARE_BITMAP(has_op, SCX_OPI_END);
/*
@@ -1083,6 +1206,31 @@ struct scx_sched {
struct scx_sched *ancestors[];
};
+/**
+ * scx_arena_to_kaddr - Map a BPF-arena pointer onto the kernel-side mapping
+ * @sch: scheduler that owns the arena @bpf_ptr points into
+ * @bpf_ptr: BPF-arena pointer; everything above the low 32 bits is discarded
+ *
+ * Truncating to u32 confines the offset to the arena's 4 GiB kern_vm range no
+ * matter what the caller passes in. Together with scratch-page fault recovery
+ * this makes the result safe to dereference up to GUARD_SZ / 2 beyond the
+ * intended object; larger accesses must be bounds-checked explicitly.
+ */
+static inline void *scx_arena_to_kaddr(struct scx_sched *sch, const void *bpf_ptr)
+{
+	u32 arena_off = (u32)(uintptr_t)bpf_ptr;
+
+	return (void *)(sch->arena_kern_base + arena_off);
+}
+
+/**
+ * scx_kaddr_to_arena - Convert a kernel-side arena address back to BPF form
+ * @sch: scheduler that owns the arena containing @kaddr
+ * @kaddr: address inside the arena's kernel mapping, from trusted kernel code
+ *
+ * Subtracts @sch->arena_kern_base from @kaddr, i.e. undoes the base addition
+ * performed by scx_arena_to_kaddr().
+ */
+static inline void *scx_kaddr_to_arena(struct scx_sched *sch, const void *kaddr)
+{
+	uintptr_t arena_off = (uintptr_t)kaddr - sch->arena_kern_base;
+
+	return (void *)arena_off;
+}
+
enum scx_wake_flags {
/* expose select WF_* flags as enums */
SCX_WAKE_FORK = WF_FORK,
@@ -1366,8 +1514,30 @@ enum scx_ops_state {
extern struct scx_sched __rcu *scx_root;
DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
+/*
+ * True when the currently loaded scheduler hierarchy is cid-form. All scheds
+ * in a hierarchy share one form, so this single key tells callsites which
+ * view to use without per-sch dereferences. Use scx_is_cid_type() to test.
+ */
+DECLARE_STATIC_KEY_FALSE(__scx_is_cid_type);
+
int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id);
+bool scx_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where);
+
+__printf(5, 0) bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
+ s64 exit_code, s32 exit_cpu, const char *fmt,
+ va_list args);
+__printf(5, 6) bool __scx_exit(struct scx_sched *sch, enum scx_exit_kind kind,
+ s64 exit_code, s32 exit_cpu, const char *fmt, ...);
+
+#define scx_exit(sch, kind, exit_code, fmt, args...) \
+ __scx_exit(sch, kind, exit_code, raw_smp_processor_id(), fmt, ##args)
+#define scx_error(sch, fmt, args...) \
+ scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args)
+#define scx_verror(sch, fmt, args) \
+ scx_vexit((sch), SCX_EXIT_ERROR, 0, raw_smp_processor_id(), fmt, args)
+
/*
* Return the rq currently locked from an scx callback, or NULL if no rq is
* locked.
@@ -1476,7 +1646,7 @@ static inline bool scx_task_on_sched(struct scx_sched *sch,
return true;
}
-static struct scx_sched *scx_prog_sched(const struct bpf_prog_aux *aux)
+static inline struct scx_sched *scx_prog_sched(const struct bpf_prog_aux *aux)
{
return rcu_dereference_all(scx_root);
}
diff --git a/kernel/sched/ext_types.h b/kernel/sched/ext_types.h
new file mode 100644
index 000000000000..8b3527e21fca
--- /dev/null
+++ b/kernel/sched/ext_types.h
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Early sched_ext type definitions.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+#ifndef _KERNEL_SCHED_EXT_TYPES_H
+#define _KERNEL_SCHED_EXT_TYPES_H
+
+enum scx_consts {
+ SCX_DSP_DFL_MAX_BATCH = 32,
+ SCX_DSP_MAX_LOOPS = 32,
+ SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
+
+ /* per-CPU chunk size for p->scx.tid allocation, see scx_alloc_tid() */
+ SCX_TID_CHUNK = 1024,
+
+ SCX_EXIT_BT_LEN = 64,
+ SCX_EXIT_MSG_LEN = 1024,
+ SCX_EXIT_DUMP_DFL_LEN = 32768,
+
+ SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE,
+
+ /*
+ * Iterating all tasks may take a while. Periodically drop
+ * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
+ */
+ SCX_TASK_ITER_BATCH = 32,
+
+ SCX_BYPASS_HOST_NTH = 2,
+
+ SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC,
+ SCX_BYPASS_LB_DONOR_PCT = 125,
+ SCX_BYPASS_LB_MIN_DELTA_DIV = 4,
+ SCX_BYPASS_LB_BATCH = 256,
+
+ SCX_REENQ_LOCAL_MAX_REPEAT = 256,
+
+ SCX_SUB_MAX_DEPTH = 4,
+};
+
+/*
+ * Per-cid topology info. For each topology level (core, LLC, node), records
+ * the first cid in the unit and its global index. Global indices are
+ * consecutive integers assigned in cid-walk order, so e.g. core_idx ranges
+ * over [0, nr_cores_at_init) with no gaps. No-topo cids have all fields set
+ * to -1.
+ *
+ * @core_cid: first cid of this cid's core (smt-sibling group)
+ * @core_idx: global index of that core, in [0, nr_cores_at_init)
+ * @llc_cid: first cid of this cid's LLC
+ * @llc_idx: global index of that LLC, in [0, nr_llcs_at_init)
+ * @node_cid: first cid of this cid's NUMA node
+ * @node_idx: global index of that node, in [0, nr_nodes_at_init)
+ */
+struct scx_cid_topo {
+ s32 core_cid;
+ s32 core_idx;
+ s32 llc_cid;
+ s32 llc_idx;
+ s32 node_cid;
+ s32 node_idx;
+};
+
+/*
+ * cmask: variable-length, base-windowed bitmap over cid space
+ * -----------------------------------------------------------
+ *
+ * A cmask covers the cid range [base, base + nr_cids). bits[] is aligned to the
+ * global 64-cid grid: bits[0] spans [base & ~63, (base & ~63) + 64), so the
+ * first (base & 63) bits of bits[0] are head padding and the trailing bits of
+ * the last active word past base + nr_cids are tail padding. Both stay zero;
+ * all mutating helpers preserve that. Words past the last active word are not
+ * read by any helper and have no constraint.
+ *
+ * Grid alignment means two cmasks always address bits[] against the same global
+ * 64-cid windows, so cross-cmask word ops (AND, OR, ...) reduce to
+ *
+ * dst->bits[i] OP= src->bits[i - delta]
+ *
+ * with no bit-shifting, regardless of how the two bases relate mod 64.
+ */
+struct scx_cmask {
+ u32 base; /* first cid of the covered range [base, base + nr_cids) */
+ u32 nr_cids; /* length of the covered range, in cids */
+ u32 alloc_words; /* number of u64 words available in bits[] */
+ u64 bits[] __counted_by(alloc_words); /* bitmap words, aligned to the global 64-cid grid */
+};
+
+/*
+ * Number of u64 words of bits[] storage that covers @nr_cids regardless of base
+ * alignment. The +1 absorbs up to 63 bits of head padding when base is not
+ * 64-aligned - always allocating one extra word beats branching on base or
+ * splitting the compute. The u64 cast keeps the +63 from wrapping when @nr_cids
+ * is near U32_MAX, so callers bounds-checking the result against @alloc_words
+ * catch the overflow instead of seeing a small value.
+ */
+#define SCX_CMASK_NR_WORDS(nr_cids) ((u32)(((u64)(nr_cids) + 63) / 64 + 1))
+
+/**
+ * __SCX_CMASK_DEFINE - Define an on-stack cmask with explicit storage capacity
+ * @NAME: variable name to define
+ * @BASE: first cid of the active range
+ * @NR_CIDS: active range length
+ * @ALLOC_CIDS: storage capacity in cids, at least @NR_CIDS
+ *
+ * @NAME aliases zero-initialized storage with the active range set to
+ * [BASE, BASE + NR_CIDS). Use scx_cmask_reframe() to reshape later, up to
+ * @ALLOC_CIDS.
+ */
+#define __SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS, ALLOC_CIDS) \
+ _DEFINE_FLEX(struct scx_cmask, NAME, bits, SCX_CMASK_NR_WORDS(ALLOC_CIDS), \
+ = { .base = (BASE), \
+ .nr_cids = (NR_CIDS), \
+ .alloc_words = SCX_CMASK_NR_WORDS(ALLOC_CIDS) })
+
+/**
+ * SCX_CMASK_DEFINE - Define an on-stack cmask on tight storage
+ * @NAME: variable name to define
+ * @BASE: first cid of the active range
+ * @NR_CIDS: active range length, also storage capacity
+ *
+ * @NAME aliases zero-initialized storage with the active range and storage
+ * both [BASE, BASE + NR_CIDS).
+ */
+#define SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS) \
+ __SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS, NR_CIDS)
+
+/**
+ * SCX_CMASK_DEFINE_SHARD - Define an on-stack cmask sized to one shard
+ * @NAME: variable name to define
+ * @BASE: first cid of the active range
+ * @NR_CIDS: active range length, must be <= SCX_CID_SHARD_MAX_CPUS
+ *
+ * Storage is fixed at SCX_CID_SHARD_MAX_CPUS, active range framed by
+ * (BASE, NR_CIDS). Passing NR_CIDS > SCX_CID_SHARD_MAX_CPUS leaves the
+ * cmask claiming more bits than storage holds and subsequent cmask
+ * operations will overrun.
+ */
+#define SCX_CMASK_DEFINE_SHARD(NAME, BASE, NR_CIDS) \
+ __SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS, SCX_CID_SHARD_MAX_CPUS)
+
+#endif /* _KERNEL_SCHED_EXT_TYPES_H */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 728965851842..d78467ec6ee1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -334,7 +334,7 @@ static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
* to a tree or when we reach the top of the tree
*/
if (cfs_rq->tg->parent &&
- cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
+ tg_cfs_rq(cfs_rq->tg->parent, cpu)->on_list) {
/*
* If parent is already on the list, we add the child
* just before. Thanks to circular linked property of
@@ -342,7 +342,7 @@ static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
* of the list that starts by parent.
*/
list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+ &(tg_cfs_rq(cfs_rq->tg->parent, cpu)->leaf_cfs_rq_list));
/*
* The branch is now connected to its tree so we can
* reset tmp_alone_branch to the beginning of the
@@ -525,7 +525,7 @@ static int se_is_idle(struct sched_entity *se)
#endif /* !CONFIG_FAIR_GROUP_SCHED */
static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
+bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
/**************************************************************
* Scheduling class tree data structure manipulation methods:
@@ -882,11 +882,11 @@ bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
*
* lag_i >= 0 -> V >= v_i
*
- * \Sum (v_i - v)*w_i
- * V = ------------------ + v
+ * \Sum (v_i - v0)*w_i
+ * V = ------------------- + v0
* \Sum w_i
*
- * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
+ * lag_i >= 0 -> \Sum (v_i - v0)*w_i >= (v_i - v0)*(\Sum w_i)
*
* Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
* to the loss in precision caused by the division.
@@ -894,7 +894,7 @@ bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
{
struct sched_entity *curr = cfs_rq->curr;
- s64 avg = cfs_rq->sum_w_vruntime;
+ s64 key, avg = cfs_rq->sum_w_vruntime;
long load = cfs_rq->sum_weight;
if (curr && curr->on_rq) {
@@ -904,7 +904,36 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
load += weight;
}
- return avg >= vruntime_op(vruntime, "-", cfs_rq->zero_vruntime) * load;
+ key = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime);
+
+ /*
+ * The worst case term for @key includes 'NSEC_TICK * NICE_0_LOAD'
+ * and @load obviously includes NICE_0_LOAD. NSEC_TICK is around 24
+ * bits, while NICE_0_LOAD is 20 on 64bit and 10 otherwise.
+ *
+ * This gives that on 64bit the product will be at least 64bit which
+ * overflows s64, while on 32bit it will only be 44bits and should fit
+ * comfortably.
+ */
+#ifdef CONFIG_64BIT
+#ifdef CONFIG_ARCH_SUPPORTS_INT128
+ /* This often results in simpler code than __builtin_mul_overflow(). */
+ return avg >= (__int128)key * load;
+#else
+ s64 rhs;
+ /*
+ * On overflow, the sign of key tells us the correct answer: a large
+ * positive key means vruntime >> V, so not eligible; a large negative
+ * key means vruntime << V, so eligible.
+ */
+ if (check_mul_overflow(key, load, &rhs))
+ return key <= 0;
+
+ return avg >= rhs;
+#endif
+#else /* 32bit */
+ return avg >= key * load;
+#endif
}
int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1321,6 +1350,8 @@ void post_init_entity_util_avg(struct task_struct *p)
sa->runnable_avg = sa->util_avg;
}
+static inline void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec);
+
static s64 update_se(struct rq *rq, struct sched_entity *se)
{
u64 now = rq_clock_task(rq);
@@ -1343,6 +1374,7 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
trace_sched_stat_runtime(running, delta_exec);
account_group_exec_runtime(running, delta_exec);
+ account_mm_sched(rq, running, delta_exec);
/* cgroup time is always accounted against the donor */
cgroup_account_cputime(donor, delta_exec);
@@ -1364,6 +1396,581 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
static void set_next_buddy(struct sched_entity *se);
+#ifdef CONFIG_SCHED_CACHE
+
+/*
+ * XXX numbers come from a place the sun don't shine -- probably wants to be SD
+ * tunable or so.
+ */
+#define EPOCH_PERIOD (HZ / 100) /* 10 ms */
+#define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */
+__read_mostly unsigned int llc_aggr_tolerance = 1;
+__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD;
+__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT;
+__read_mostly unsigned int llc_imb_pct = 20;
+__read_mostly unsigned int llc_overaggr_pct = 50;
+
+/* Return the sd_llc_id recorded for @cpu, or -1 when @cpu is negative. */
+static int llc_id(int cpu)
+{
+	return cpu < 0 ? -1 : per_cpu(sd_llc_id, cpu);
+}
+
+/*
+ * Turn the llc_aggr_tolerance knob into a scale factor: 0 when the knob is 0,
+ * INT_MAX when it is 100 or more, and 1 + (tolerance - 1) * @mul otherwise.
+ */
+static inline int get_sched_cache_scale(int mul)
+{
+	unsigned int tolerance = READ_ONCE(llc_aggr_tolerance);
+
+	if (tolerance >= 100)
+		return INT_MAX;
+
+	return tolerance ? 1 + (tolerance - 1) * mul : 0;
+}
+
+static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
+{
+#ifdef CONFIG_NUMA_BALANCING
+ unsigned long llc, footprint;
+ struct sched_domain *sd;
+ int scale;
+
+ guard(rcu)();
+
+ sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd);
+ if (!sd)
+ return true;
+
+ if (static_branch_likely(&sched_numa_balancing)) {
+ /*
+ * TBD: RDT exclusive LLC ways reserved should be
+ * excluded.
+ */
+ llc = sd->llc_bytes;
+ footprint = READ_ONCE(mm->sc_stat.footprint);
+
+ /*
+ * Scale the LLC size (in bytes) by
+ * 1 + (llc_aggr_tolerance - 1) * 256, see
+ * get_sched_cache_scale(), and compare it to the
+ * process's footprint, which is kept in pages and
+ * converted to bytes with PAGE_SIZE below.
+ *
+ * Suppose the L3 size is 32MB.
+ * If the llc_aggr_tolerance is 1:
+ * when the footprint is larger than 32MB, the
+ * process is regarded as exceeding the LLC
+ * capacity.
+ * If the llc_aggr_tolerance is 99:
+ * when the footprint is larger than 784GB, the
+ * process is regarded as exceeding the LLC
+ * capacity:
+ * 784GB = (1 + (99 - 1) * 256) * 32MB
+ * If the llc_aggr_tolerance is 100 or more:
+ * ignore the footprint and do the aggregation
+ * anyway.
+ */
+ scale = get_sched_cache_scale(256);
+ if (scale == INT_MAX)
+ return false;
+
+ return ((llc * (u64)scale) < (footprint * PAGE_SIZE));
+ }
+#endif
+ return false;
+}
+
+/*
+ * Decide from thread counts alone whether @p's process should not be
+ * aggregated onto one LLC. Returns true for single-threaded processes and
+ * when the average number of running threads does not fit in the (scaled)
+ * LLC size of @cpu; returns false when the tolerance knob disables the check.
+ */
+static bool invalid_llc_nr(struct mm_struct *mm, struct task_struct *p,
+			   int cpu)
+{
+	int scale;
+
+	if (get_nr_threads(p) <= 1)
+		return true;
+
+	/*
+	 * The number of 'cores' in an LLC is scaled by llc_aggr_tolerance
+	 * before being compared with the process's active threads.
+	 */
+	scale = get_sched_cache_scale(1);
+	if (scale == INT_MAX)
+		return false;
+
+	if (fits_capacity((mm->sc_stat.nr_running_avg * cpu_smt_num_threads),
+			  (scale * per_cpu(sd_llc_size, cpu))))
+		return false;
+
+	return true;
+}
+
+/*
+ * Account @p, which has a preferred LLC, as enqueued on @rq: bump the rq's
+ * LLC counters and the per-LLC count of the rq's sched_domain.
+ */
+static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
+{
+	int llc = p->preferred_llc;
+	struct sched_domain *sd;
+	int on_pref;
+
+	/* tasks without a preferred LLC are not tracked */
+	if (llc < 0)
+		return;
+
+	on_pref = (llc == task_llc(p));
+
+	rq->nr_llc_running++;
+	rq->nr_pref_llc_running += on_pref;
+
+	/*
+	 * Remember whether p was counted as running on its preferred LLC so
+	 * that account_llc_dequeue() can undo exactly what was done here and
+	 * nr_pref_llc_running stays consistent per runqueue.
+	 *
+	 * Re-evaluating task_llc(p) at dequeue time would be racy: after the
+	 * task has been enqueued, task_llc(p) may change due to CPU hotplug,
+	 * so the dequeue side relies on p->pref_llc_queued instead.
+	 */
+	p->pref_llc_queued = on_pref;
+
+	sd = rcu_dereference_all(rq->sd);
+	if (!sd || (unsigned int)llc >= sd->llc_max)
+		return;
+
+	sd->llc_counts[llc]++;
+}
+
+/*
+ * Undo account_llc_enqueue() for @p on @rq: drop the rq's LLC counters and
+ * the per-LLC count of the rq's sched_domain.
+ */
+static void account_llc_dequeue(struct rq *rq, struct task_struct *p)
+{
+	int llc = p->preferred_llc;
+	struct sched_domain *sd;
+
+	if (llc < 0)
+		return;
+
+	rq->nr_llc_running--;
+	if (p->pref_llc_queued) {
+		rq->nr_pref_llc_running--;
+		/* clear the flag in case other logic queries it */
+		p->pref_llc_queued = 0;
+	}
+
+	sd = rcu_dereference_all(rq->sd);
+	if (!sd || (unsigned int)llc >= sd->llc_max)
+		return;
+
+	/*
+	 * Dequeue races with CPU hotplug: after a task has been enqueued on
+	 * CPUx, a hotplug event makes all online CPUs (including CPUx)
+	 * rebuild their sched_domains and reset statistics, sd->llc_counts
+	 * included, to zero. The counter may therefore already be 0 here, so
+	 * guard against underflow.
+	 *
+	 * The resulting undercount is temporary; accurate accounting resumes
+	 * once the rq has a chance to be idle.
+	 */
+	if (sd->llc_counts[llc])
+		sd->llc_counts[llc]--;
+}
+
+void mm_init_sched(struct mm_struct *mm,
+ struct sched_cache_time __percpu *_pcpu_sched)
+{
+ unsigned long epoch = 0;
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct sched_cache_time *pcpu_sched = per_cpu_ptr(_pcpu_sched, i);
+ struct rq *rq = cpu_rq(i);
+
+ pcpu_sched->runtime = 0;
+ /*
+ * A slightly stale cpu epoch is acceptable: rq->cpu_epoch is
+ * read here without taking rq->cpu_epoch_lock.
+ */
+ pcpu_sched->epoch = rq->cpu_epoch;
+ epoch = rq->cpu_epoch;
+ }
+
+ raw_spin_lock_init(&mm->sc_stat.lock);
+ mm->sc_stat.epoch = epoch;
+ mm->sc_stat.cpu = -1;
+ mm->sc_stat.next_scan = jiffies;
+ mm->sc_stat.nr_running_avg = 0;
+ mm->sc_stat.footprint = 0;
+ /*
+ * Publish pcpu_sched last and with release ordering, so that
+ * the stores initializing the other mm->sc_stat fields above
+ * cannot be reordered after it. Otherwise a reader that sees a
+ * non-NULL pcpu_sched could observe an invalid sc_stat.epoch, etc.
+ */
+ smp_store_release(&mm->sc_stat.pcpu_sched, _pcpu_sched);
+}
+
+/*
+ * Right-shift *@val by @n bits, yielding 0 for shift counts of 64 or more
+ * (a shift by >= the operand width is undefined in C).
+ *
+ * @n is unsigned long because the callers pass unsigned long epoch deltas:
+ * with a narrower parameter the value would be truncated on LP64 before the
+ * range check, letting a huge delta alias to a small shift count.
+ */
+static __always_inline void __shr_u64(u64 *val, unsigned long n)
+{
+	if (n >= 64) {
+		*val = 0;
+		return;
+	}
+	*val >>= n;
+}
+
+/*
+ * Bring @rq's epoch bookkeeping and @pcpu_sched's runtime up to date.
+ *
+ * First the rq epoch is advanced by however many llc_epoch_period intervals
+ * have expired, halving rq->cpu_runtime once per elapsed epoch. Then
+ * @pcpu_sched catches up with the rq epoch, its runtime being halved once per
+ * epoch it was behind. Caller must hold rq->cpu_epoch_lock.
+ */
+static inline void __update_mm_sched(struct rq *rq,
+				     struct sched_cache_time *pcpu_sched)
+{
+	lockdep_assert_held(&rq->cpu_epoch_lock);
+
+	unsigned int period = max(READ_ONCE(llc_epoch_period), 1U);
+	long overdue = jiffies - rq->cpu_epoch_next;
+	unsigned long nr_epochs;
+
+	if (overdue > 0) {
+		/* round up: a partially elapsed period counts as a full epoch */
+		nr_epochs = (overdue + period - 1) / period;
+		rq->cpu_epoch += nr_epochs;
+		rq->cpu_epoch_next += nr_epochs * period;
+		__shr_u64(&rq->cpu_runtime, nr_epochs);
+	}
+
+	nr_epochs = rq->cpu_epoch - pcpu_sched->epoch;
+	if (!nr_epochs)
+		return;
+
+	pcpu_sched->epoch += nr_epochs;
+	__shr_u64(&pcpu_sched->runtime, nr_epochs);
+}
+
+static unsigned long fraction_mm_sched(struct rq *rq,
+ struct sched_cache_time *pcpu_sched)
+{
+ guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock);
+
+ __update_mm_sched(rq, pcpu_sched);
+
+ /*
+ * Runtime is a geometric series (r=0.5) and as such will sum to twice
+ * the accumulation period, this means the multiplication here should
+ * not overflow. The + 1 keeps the divisor non-zero when
+ * rq->cpu_runtime is 0.
+ */
+ return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1);
+}
+
+/*
+ * Return the LLC id that @p should prefer according to @mm's recorded
+ * sc_stat.cpu, or -1 when there is no @mm, no recorded CPU, or the recorded
+ * CPU sits on a different node than @p's NUMA-preferred node.
+ */
+static int get_pref_llc(struct task_struct *p, struct mm_struct *mm)
+{
+	int pref_cpu;
+
+	if (!mm)
+		return -1;
+
+	pref_cpu = READ_ONCE(mm->sc_stat.cpu);
+	if (pref_cpu == -1)
+		return -1;
+
+#ifdef CONFIG_NUMA_BALANCING
+	/*
+	 * Do not hand out a preferred LLC that conflicts with NUMA
+	 * balancing. Such a conflict can appear when sched_setnuma() gets
+	 * called, but it is not much of an issue: account_mm_sched() is
+	 * expected to run fairly regularly -- at a higher rate than
+	 * sched_setnuma() at least -- so the conflict only exists for a
+	 * short period of time.
+	 */
+	if (static_branch_likely(&sched_numa_balancing) &&
+	    p->numa_preferred_nid >= 0 &&
+	    cpu_to_node(pref_cpu) != p->numa_preferred_nid)
+		return -1;
+#endif
+
+	return llc_id(pref_cpu);
+}
+
+static unsigned int task_running_on_cpu(int cpu, struct task_struct *p);
+
+/*
+ * Charge @delta_exec of runtime by fair task @p on @rq to the per-CPU
+ * occupancy statistics of p's mm, invalidate the mm's preferred CPU when it
+ * has gone stale or the process no longer qualifies for aggregation, and
+ * refresh p->preferred_llc (with the matching rq/sd accounting) if the
+ * preference changed.
+ *
+ * Does nothing when the sched-cache feature is disabled, @p is not a fair
+ * task, or p's mm has no per-CPU statistics set up.
+ */
+static inline
+void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
+{
+	struct sched_cache_time *pcpu_sched;
+	struct mm_struct *mm = p->mm;
+	unsigned long epoch;
+	int mm_sched_llc;
+	long timeout;
+
+	if (!sched_cache_enabled())
+		return;
+
+	if (p->sched_class != &fair_sched_class)
+		return;
+	/*
+	 * init_task, kthreads and user thread created
+	 * by user_mode_thread() don't have mm.
+	 */
+	if (!mm || !mm->sc_stat.pcpu_sched)
+		return;
+
+	pcpu_sched = per_cpu_ptr(mm->sc_stat.pcpu_sched, cpu_of(rq));
+
+	scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
+		__update_mm_sched(rq, pcpu_sched);
+		pcpu_sched->runtime += delta_exec;
+		rq->cpu_runtime += delta_exec;
+		epoch = rq->cpu_epoch;
+	}
+
+	/*
+	 * Snapshot the tunable once and bring it into the signed domain. The
+	 * epoch delta below is deliberately signed (the mm epoch may be
+	 * slightly ahead of this rq's epoch); comparing it against an
+	 * unsigned int directly would turn into an unsigned comparison where
+	 * long is 32 bits wide and make every negative delta look like a
+	 * timeout.
+	 */
+	timeout = min_t(unsigned long, READ_ONCE(llc_epoch_affinity_timeout),
+			LONG_MAX);
+
+	/*
+	 * If this process hasn't hit task_cache_work() for a while invalidate
+	 * its preferred state.
+	 */
+	if ((long)(epoch - READ_ONCE(mm->sc_stat.epoch)) > timeout ||
+	    invalid_llc_nr(mm, p, cpu_of(rq)) ||
+	    exceed_llc_capacity(mm, cpu_of(rq))) {
+		if (READ_ONCE(mm->sc_stat.cpu) != -1)
+			WRITE_ONCE(mm->sc_stat.cpu, -1);
+	}
+
+	mm_sched_llc = get_pref_llc(p, mm);
+
+	/* task not on rq accounted later in account_entity_enqueue() */
+	if (task_running_on_cpu(cpu_of(rq), p) &&
+	    READ_ONCE(p->preferred_llc) != mm_sched_llc) {
+		/* re-account under the new preference: drop old, store, add new */
+		account_llc_dequeue(rq, p);
+		WRITE_ONCE(p->preferred_llc, mm_sched_llc);
+		account_llc_enqueue(rq, p);
+	}
+}
+
+/*
+ * Tick hook: queue p's cache_work task work when the rq epoch has moved past
+ * the epoch recorded in p's mm, and record the new epoch in the mm.
+ */
+static void task_tick_cache(struct rq *rq, struct task_struct *p)
+{
+	struct callback_head *work = &p->cache_work;
+	struct mm_struct *mm = p->mm;
+	unsigned long epoch;
+
+	if (!sched_cache_enabled())
+		return;
+
+	if (!mm || (p->flags & PF_KTHREAD))
+		return;
+
+	if (!mm->sc_stat.pcpu_sched)
+		return;
+
+	epoch = rq->cpu_epoch;
+	/* never move the mm epoch backwards */
+	if (time_after_eq(mm->sc_stat.epoch, epoch))
+		return;
+
+	guard(raw_spinlock)(&mm->sc_stat.lock);
+
+	/*
+	 * work->next pointing at work itself is the "not queued" marker set
+	 * up by init_sched_mm() and restored by task_cache_work().
+	 */
+	if (work->next != work)
+		return;
+
+	task_work_add(p, work, TWA_RESUME);
+	WRITE_ONCE(mm->sc_stat.epoch, epoch);
+}
+
+/*
+ * Build in @cpus the set of CPUs task_cache_work() should scan for @p.
+ *
+ * With NUMA balancing active and a preferred node set for @p, the result is
+ * the union of that node, the node of the mm's current preferred CPU (if
+ * any) and the node @p currently runs on. These masks are ORed into @cpus,
+ * so the caller is expected to pass in a cleared mask (task_cache_work()
+ * allocates it with zalloc_cpumask_var()). In every other case @cpus is
+ * overwritten with cpu_online_mask.
+ */
+static void get_scan_cpumasks(cpumask_var_t cpus, struct task_struct *p)
+{
+#ifdef CONFIG_NUMA_BALANCING
+	int cpu, curr_cpu, pref_nid;
+	/*
+	 * Only assigned a real node when the mm has a preferred CPU; start
+	 * from NUMA_NO_NODE so the check below never depends on an
+	 * indeterminate value.
+	 */
+	int nid = NUMA_NO_NODE;
+
+	if (!static_branch_likely(&sched_numa_balancing))
+		goto out;
+
+	cpu = READ_ONCE(p->mm->sc_stat.cpu);
+	if (cpu != -1)
+		nid = cpu_to_node(cpu);
+	curr_cpu = task_cpu(p);
+
+	/*
+	 * Scanning in the preferred NUMA node is ideal. However, the NUMA
+	 * preferred node is per-task rather than per-process. It is possible
+	 * for different threads of the process to have distinct preferred
+	 * nodes; consequently, the process-wide preferred LLC may bounce
+	 * between different nodes. As a workaround, maintain the scan
+	 * CPU mask to also cover the process's current preferred LLC and the
+	 * current running node to mitigate the bouncing risk.
+	 * TBD: numa_group should be considered during task aggregation.
+	 */
+	pref_nid = p->numa_preferred_nid;
+	/* honor the task's preferred node */
+	if (pref_nid == NUMA_NO_NODE)
+		goto out;
+
+	cpumask_or(cpus, cpus, cpumask_of_node(pref_nid));
+
+	/* honor the task's preferred LLC CPU */
+	if (cpu != -1 && !cpumask_test_cpu(cpu, cpus) && nid != NUMA_NO_NODE)
+		cpumask_or(cpus, cpus, cpumask_of_node(nid));
+
+	/* make sure the task's current running node is included */
+	if (!cpumask_test_cpu(curr_cpu, cpus))
+		cpumask_or(cpus, cpus, cpumask_of_node(cpu_to_node(curr_cpu)));
+
+	return;
+
+out:
+#endif
+	cpumask_copy(cpus, cpu_online_mask);
+}
+
+/*
+ * Fold @sample into the running average *@avg, moving the average by
+ * 1/weight of the difference, where weight is derived from the LLC size of
+ * the CPU this runs on.
+ */
+static inline void update_avg_scale(u64 *avg, u64 sample)
+{
+	int llc_cpus = per_cpu(sd_llc_size, raw_smp_processor_id());
+	/*
+	 * A quarter of the LLC's CPU count, kept within [2, 8]: small LLC
+	 * domains get a small divisor and thus react more sharply to changes
+	 * in nr_running, while large ones are capped at 8, the default EWMA
+	 * smoothing factor used by update_avg().
+	 */
+	u32 weight = clamp_t(u32, (llc_cpus >> 2), 2, 8);
+	s64 delta = sample - *avg;
+
+	*avg += div64_s64(delta, weight);
+}
+
+static void task_cache_work(struct callback_head *work)
+{
+ int cpu, m_a_cpu = -1, nr_running = 0, curr_cpu;
+ unsigned long next_scan, now = jiffies;
+ struct task_struct *p = current, *cur;
+ unsigned long curr_m_a_occ = 0;
+ struct mm_struct *mm = p->mm;
+ unsigned long m_a_occ = 0;
+ cpumask_var_t cpus;
+
+ WARN_ON_ONCE(work != &p->cache_work);
+
+ work->next = work;
+
+ if (p->flags & PF_EXITING)
+ return;
+
+ next_scan = READ_ONCE(mm->sc_stat.next_scan);
+ if (time_before(now, next_scan))
+ return;
+
+ /* only 1 thread is allowed to scan */
+ if (!try_cmpxchg(&mm->sc_stat.next_scan, &next_scan,
+ now + max_t(unsigned long,
+ READ_ONCE(llc_epoch_period), 1)))
+ return;
+
+ curr_cpu = task_cpu(p);
+ if (invalid_llc_nr(mm, p, curr_cpu) ||
+ exceed_llc_capacity(mm, curr_cpu)) {
+ if (READ_ONCE(mm->sc_stat.cpu) != -1)
+ WRITE_ONCE(mm->sc_stat.cpu, -1);
+
+ return;
+ }
+
+ if (!zalloc_cpumask_var(&cpus, GFP_KERNEL))
+ return;
+
+ scoped_guard (cpus_read_lock) {
+ guard(rcu)();
+
+ get_scan_cpumasks(cpus, p);
+
+ for_each_cpu(cpu, cpus) {
+ /* XXX sched_cluster_active */
+ struct sched_domain *sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
+ unsigned long occ, m_occ = 0, a_occ = 0;
+ int m_cpu = -1, i;
+
+ if (!sd)
+ continue;
+
+ for_each_cpu(i, sched_domain_span(sd)) {
+ occ = fraction_mm_sched(cpu_rq(i),
+ per_cpu_ptr(mm->sc_stat.pcpu_sched, i));
+ a_occ += occ;
+ if (occ > m_occ) {
+ m_occ = occ;
+ m_cpu = i;
+ }
+
+ cur = rcu_dereference_all(cpu_rq(i)->curr);
+ if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) &&
+ cur->mm == mm)
+ nr_running++;
+ }
+
+ /*
+ * Compare the accumulated occupancy of each LLC. The
+ * reason for using accumulated occupancy rather than average
+ * per CPU occupancy is that it works better in asymmetric LLC
+ * scenarios.
+ * For example, if there are 2 threads in a 4CPU LLC and 3
+ * threads in an 8CPU LLC, it might be better to choose the one
+ * with 3 threads. However, this would not be the case if the
+ * occupancy is divided by the number of CPUs in an LLC (i.e.,
+ * if average per CPU occupancy is used).
+ * Besides, NUMA balancing fault statistics behave similarly:
+ * the total number of faults per node is compared rather than
+ * the average number of faults per CPU. This strategy is also
+ * followed here.
+ */
+ if (a_occ > m_a_occ) {
+ m_a_occ = a_occ;
+ m_a_cpu = m_cpu;
+ }
+
+ if (llc_id(cpu) == llc_id(READ_ONCE(mm->sc_stat.cpu)))
+ curr_m_a_occ = a_occ;
+
+ cpumask_andnot(cpus, cpus, sched_domain_span(sd));
+ }
+ }
+
+ if (m_a_occ > (2 * curr_m_a_occ)) {
+ /*
+ * Avoid switching sc_stat.cpu too fast.
+ * The reason to choose 2X is because:
+ * 1. It is better to keep the preferred LLC stable,
+ * rather than changing it frequently and cause migrations
+ * 2. 2X means the new preferred LLC has at least 1 more
+ * busy CPU than the old one(200% vs 100%, eg)
+ * 3. 2X is chosen based on test results, as it delivers
+ * the optimal performance gain so far.
+ */
+ WRITE_ONCE(mm->sc_stat.cpu, m_a_cpu);
+ }
+
+ update_avg_scale(&mm->sc_stat.nr_running_avg, nr_running);
+ free_cpumask_var(cpus);
+}
+
+/*
+ * Set up the sched-cache state of a new task @p: bind task_cache_work() to
+ * its cache_work callback, mark the callback as not queued, and clear the
+ * preferred LLC.
+ */
+void init_sched_mm(struct task_struct *p)
+{
+	init_task_work(&p->cache_work, task_cache_work);
+	/* a callback whose ->next points at itself is treated as not queued */
+	p->cache_work.next = &p->cache_work;
+
+	/*
+	 * Start without an LLC preference so that a new task does not
+	 * pollute account_llc_enqueue().
+	 */
+	p->preferred_llc = -1;
+}
+
+#else /* CONFIG_SCHED_CACHE */
+
+static inline void account_mm_sched(struct rq *rq, struct task_struct *p,
+ s64 delta_exec) { }
+
+void init_sched_mm(struct task_struct *p) { }
+
+static void task_tick_cache(struct rq *rq, struct task_struct *p) { }
+
+static inline int get_pref_llc(struct task_struct *p,
+ struct mm_struct *mm)
+{
+ return -1;
+}
+
+static void account_llc_enqueue(struct rq *rq, struct task_struct *p) {}
+
+static void account_llc_dequeue(struct rq *rq, struct task_struct *p) {}
+
+#endif /* CONFIG_SCHED_CACHE */
+
/*
* Used by other classes to account runtime.
*/
@@ -1549,13 +2156,9 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->exec_start = rq_clock_task(rq_of(cfs_rq));
}
-/**************************************************
- * Scheduling class queueing methods:
- */
-
+/* Check sched_smt_active before calling this to avoid overheads in fastpaths */
static inline bool is_core_idle(int cpu)
{
-#ifdef CONFIG_SCHED_SMT
int sibling;
for_each_cpu(sibling, cpu_smt_mask(cpu)) {
@@ -1565,7 +2168,6 @@ static inline bool is_core_idle(int cpu)
if (!idle_cpu(sibling))
return false;
}
-#endif
return true;
}
@@ -2248,12 +2850,11 @@ numa_type numa_classify(unsigned int imbalance_pct,
return node_fully_busy;
}
-#ifdef CONFIG_SCHED_SMT
/* Forward declarations of select_idle_sibling helpers */
static inline bool test_idle_cores(int cpu);
static inline int numa_idle_core(int idle_core, int cpu)
{
- if (!static_branch_likely(&sched_smt_present) ||
+ if (!sched_smt_active() ||
idle_core >= 0 || !test_idle_cores(cpu))
return idle_core;
@@ -2266,12 +2867,6 @@ static inline int numa_idle_core(int idle_core, int cpu)
return idle_core;
}
-#else /* !CONFIG_SCHED_SMT: */
-static inline int numa_idle_core(int idle_core, int cpu)
-{
- return idle_core;
-}
-#endif /* !CONFIG_SCHED_SMT */
/*
* Gather all necessary information to make NUMA balancing placement
@@ -3050,6 +3645,7 @@ static void task_numa_placement(struct task_struct *p)
unsigned long total_faults;
u64 runtime, period;
spinlock_t *group_lock = NULL;
+ long __maybe_unused new_fp;
struct numa_group *ng;
/*
@@ -3124,6 +3720,31 @@ static void task_numa_placement(struct task_struct *p)
ng->total_faults += diff;
group_faults += ng->faults[mem_idx];
}
+#ifdef CONFIG_SCHED_CACHE
+ /*
+ * Per task p->numa_faults[mem_idx] converges,
+ * so the accumulation of each task's faults
+ * converges too - Given the number of threads,
+ * it cannot overflow an unsigned long.
+ * Racy with concurrent updates from other threads
+ * sharing this mm. Acceptable since footprint is a
+ * heuristic and occasional lost updates are tolerable.
+ *
+ * If a task exits, its corresponding footprint must
+ * be subtracted from the mm->sc_stat.footprint, otherwise
+ * the mm->sc_stat.footprint will not converge:
+ * the exiting thread's footprint remains unchanged/undecayed
+ * in mm->sc_stat.footprint. See exit_mm().
+ *
+ * Lost updates and unsynchronized subtraction
+ * in exit_mm() can cause footprint + diff to
+ * go negative. Clamp to zero to prevent the
+ * unsigned footprint from wrapping.
+ */
+ new_fp = (long)READ_ONCE(p->mm->sc_stat.footprint) + diff;
+ WRITE_ONCE(p->mm->sc_stat.footprint,
+ max(new_fp, 0L));
+#endif
}
if (!ng) {
@@ -3848,9 +4469,11 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
if (entity_is_task(se)) {
+ struct task_struct *p = task_of(se);
struct rq *rq = rq_of(cfs_rq);
- account_numa_enqueue(rq, task_of(se));
+ account_numa_enqueue(rq, p);
+ account_llc_enqueue(rq, p);
list_add(&se->group_node, &rq->cfs_tasks);
}
cfs_rq->nr_queued++;
@@ -3861,7 +4484,11 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
if (entity_is_task(se)) {
- account_numa_dequeue(rq_of(cfs_rq), task_of(se));
+ struct task_struct *p = task_of(se);
+ struct rq *rq = rq_of(cfs_rq);
+
+ account_numa_dequeue(rq, p);
+ account_llc_dequeue(rq, p);
list_del_init(&se->group_node);
}
cfs_rq->nr_queued--;
@@ -4364,7 +4991,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
* For migration heavy workloads, access to tg->load_avg can be
* unbound. Limit the update rate to at most once per ms.
*/
- now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+ now = rq_clock(rq_of(cfs_rq));
if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC)
return;
@@ -4387,7 +5014,7 @@ static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
if (cfs_rq->tg == &root_task_group)
return;
- now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+ now = rq_clock(rq_of(cfs_rq));
delta = 0 - cfs_rq->tg_load_avg_contrib;
atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = 0;
@@ -4408,13 +5035,13 @@ static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
*/
rq_clock_start_loop_update(rq);
- rcu_read_lock();
+ guard(rcu)();
+
list_for_each_entry_rcu(tg, &task_groups, list) {
- struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq));
clear_tg_load_avg(cfs_rq);
}
- rcu_read_unlock();
rq_clock_stop_loop_update(rq);
}
@@ -4930,13 +5557,86 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
trace_pelt_cfs_tp(cfs_rq);
}
+#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
+
+static inline void util_est_update(struct sched_entity *se)
+{
+ unsigned int ewma, dequeued, last_ewma_diff;
+
+ if (!sched_feat(UTIL_EST))
+ return;
+
+ /* Get current estimate of utilization */
+ ewma = READ_ONCE(se->avg.util_est);
+
+ /*
+ * If the PELT values haven't changed since enqueue time,
+ * skip the util_est update.
+ */
+ if (ewma & UTIL_AVG_UNCHANGED)
+ return;
+
+ /* Get utilization at dequeue */
+ dequeued = READ_ONCE(se->avg.util_avg);
+
+ /*
+ * Reset EWMA on utilization increases, the moving average is used only
+ * to smooth utilization decreases.
+ */
+ if (ewma <= dequeued) {
+ ewma = dequeued;
+ goto done;
+ }
+
+ /*
+ * Skip update of task's estimated utilization when it is already
+ * within ~1% of its last activation value.
+ */
+ last_ewma_diff = ewma - dequeued;
+ if (last_ewma_diff < UTIL_EST_MARGIN)
+ goto done;
+
+ /*
+ * To avoid underestimate of task utilization, skip updates of EWMA if
+ * we cannot grant that thread got all CPU time it wanted.
+ */
+ if ((dequeued + UTIL_EST_MARGIN) < READ_ONCE(se->avg.runnable_avg))
+ goto done;
+
+ /*
+ * Update Task's estimated utilization
+ *
+ * When *p completes an activation we can consolidate another sample
+ * of the task size. This is done by using this value to update the
+ * Exponential Weighted Moving Average (EWMA):
+ *
+ * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
+ * = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
+ * = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
+ * = w * ( -last_ewma_diff ) + ewma(t-1)
+ * = w * (-last_ewma_diff + ewma(t-1) / w)
+ *
+ * Where 'w' is the weight of new samples, which is configured to be
+ * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
+ */
+ ewma <<= UTIL_EST_WEIGHT_SHIFT;
+ ewma -= last_ewma_diff;
+ ewma >>= UTIL_EST_WEIGHT_SHIFT;
+done:
+ ewma |= UTIL_AVG_UNCHANGED;
+ WRITE_ONCE(se->avg.util_est, ewma);
+
+ trace_sched_util_est_se_tp(se);
+}
+
/*
* Optional action to be done while updating the load average
*/
-#define UPDATE_TG 0x1
-#define SKIP_AGE_LOAD 0x2
-#define DO_ATTACH 0x4
-#define DO_DETACH 0x8
+#define UPDATE_TG 0x01
+#define SKIP_AGE_LOAD 0x02
+#define DO_ATTACH 0x04
+#define DO_DETACH 0x08
+#define UPDATE_UTIL_EST 0x10
/* Update task and its cfs_rq load average */
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -4979,6 +5679,9 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
if (flags & UPDATE_TG)
update_tg_load_avg(cfs_rq);
}
+
+ if (flags & UPDATE_UTIL_EST)
+ util_est_update(se);
}
/*
@@ -5037,11 +5740,6 @@ static inline unsigned long task_util(struct task_struct *p)
return READ_ONCE(p->se.avg.util_avg);
}
-static inline unsigned long task_runnable(struct task_struct *p)
-{
- return READ_ONCE(p->se.avg.runnable_avg);
-}
-
static inline unsigned long _task_util_est(struct task_struct *p)
{
return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED;
@@ -5084,88 +5782,6 @@ static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
trace_sched_util_est_cfs_tp(cfs_rq);
}
-#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
-
-static inline void util_est_update(struct cfs_rq *cfs_rq,
- struct task_struct *p,
- bool task_sleep)
-{
- unsigned int ewma, dequeued, last_ewma_diff;
-
- if (!sched_feat(UTIL_EST))
- return;
-
- /*
- * Skip update of task's estimated utilization when the task has not
- * yet completed an activation, e.g. being migrated.
- */
- if (!task_sleep)
- return;
-
- /* Get current estimate of utilization */
- ewma = READ_ONCE(p->se.avg.util_est);
-
- /*
- * If the PELT values haven't changed since enqueue time,
- * skip the util_est update.
- */
- if (ewma & UTIL_AVG_UNCHANGED)
- return;
-
- /* Get utilization at dequeue */
- dequeued = task_util(p);
-
- /*
- * Reset EWMA on utilization increases, the moving average is used only
- * to smooth utilization decreases.
- */
- if (ewma <= dequeued) {
- ewma = dequeued;
- goto done;
- }
-
- /*
- * Skip update of task's estimated utilization when its members are
- * already ~1% close to its last activation value.
- */
- last_ewma_diff = ewma - dequeued;
- if (last_ewma_diff < UTIL_EST_MARGIN)
- goto done;
-
- /*
- * To avoid underestimate of task utilization, skip updates of EWMA if
- * we cannot grant that thread got all CPU time it wanted.
- */
- if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
- goto done;
-
-
- /*
- * Update Task's estimated utilization
- *
- * When *p completes an activation we can consolidate another sample
- * of the task size. This is done by using this value to update the
- * Exponential Weighted Moving Average (EWMA):
- *
- * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
- * = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
- * = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
- * = w * ( -last_ewma_diff ) + ewma(t-1)
- * = w * (-last_ewma_diff + ewma(t-1) / w)
- *
- * Where 'w' is the weight of new samples, which is configured to be
- * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
- */
- ewma <<= UTIL_EST_WEIGHT_SHIFT;
- ewma -= last_ewma_diff;
- ewma >>= UTIL_EST_WEIGHT_SHIFT;
-done:
- ewma |= UTIL_AVG_UNCHANGED;
- WRITE_ONCE(p->se.avg.util_est, ewma);
-
- trace_sched_util_est_se_tp(&p->se);
-}
-
static inline unsigned long get_actual_cpu_capacity(int cpu)
{
unsigned long capacity = arch_scale_cpu_capacity(cpu);
@@ -5618,7 +6234,7 @@ static bool
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
bool sleep = flags & DEQUEUE_SLEEP;
- int action = UPDATE_TG;
+ int action = 0;
update_curr(cfs_rq);
clear_buddies(cfs_rq, se);
@@ -5638,15 +6254,23 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (sched_feat(DELAY_DEQUEUE) && delay &&
!entity_eligible(cfs_rq, se)) {
- update_load_avg(cfs_rq, se, 0);
+ if (entity_is_task(se))
+ action |= UPDATE_UTIL_EST;
+ update_load_avg(cfs_rq, se, action);
update_entity_lag(cfs_rq, se);
set_delayed(se);
return false;
}
}
- if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
- action |= DO_DETACH;
+ action = UPDATE_TG;
+ if (entity_is_task(se)) {
+ if (task_on_rq_migrating(task_of(se)))
+ action |= DO_DETACH;
+
+ if (sleep && !(flags & DEQUEUE_DELAYED))
+ action |= UPDATE_UTIL_EST;
+ }
/*
* When dequeuing a sched_entity, we must:
@@ -5764,8 +6388,6 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq, bool protect)
return se;
}
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
-
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
/*
@@ -5775,9 +6397,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
if (prev->on_rq)
update_curr(cfs_rq);
- /* throttle cfs_rqs exceeding runtime */
- check_cfs_rq_runtime(cfs_rq);
-
if (prev->on_rq) {
update_stats_wait_start_fair(cfs_rq, prev);
/* Put 'current' back into the tree. */
@@ -5912,44 +6531,32 @@ static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
return cfs_rq->runtime_remaining > 0;
}
-/* returns 0 on failure to allocate runtime */
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- int ret;
-
- raw_spin_lock(&cfs_b->lock);
- ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
- raw_spin_unlock(&cfs_b->lock);
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq);
- return ret;
-}
-
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
+static bool __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec;
if (likely(cfs_rq->runtime_remaining > 0))
- return;
+ return false;
if (cfs_rq->throttled)
- return;
+ return true;
/*
- * if we're unable to extend our runtime we resched so that the active
- * hierarchy can be throttled
+ * throttle_cfs_rq() will try to extend the runtime first
+ * before throttling the hierarchy.
*/
- if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
- resched_curr(rq_of(cfs_rq));
+ return throttle_cfs_rq(cfs_rq);
}
static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
+bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
- return;
+ return false;
- __account_cfs_rq_runtime(cfs_rq, delta_exec);
+ return __account_cfs_rq_runtime(cfs_rq, delta_exec);
}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
@@ -5970,7 +6577,7 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
{
- return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]);
+ return throttled_hierarchy(tg_cfs_rq(task_group(p), dst_cpu));
}
static inline bool task_is_throttled(struct task_struct *p)
@@ -6116,8 +6723,18 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
struct rq *rq = data;
- struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq));
struct task_struct *p, *tmp;
+ LIST_HEAD(throttled_tasks);
+
+ /*
+ * If cfs_rq->curr is set, the cfs_rq might not have caught up
+ * since the last clock update. Do it now, before we begin
+ * queueing tasks onto it, to avoid needlessly unthrottling
+ * the hierarchy only for this cfs_rq to be throttled
+ * right back again.
+ */
+ update_curr(cfs_rq);
if (--cfs_rq->throttle_count)
return 0;
@@ -6139,13 +6756,31 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttled_clock_self_time += delta;
}
+ /*
+ * Move the tasks to a local list since an update_curr() during
+ * enqueue_task_fair() can throttle a higher cfs_rq, and it can
+ * see the "throttled_limbo_list" being non-empty in
+ * tg_throttle_down() if throttle_count turned 0 above.
+ */
+ list_splice_init(&cfs_rq->throttled_limbo_list, &throttled_tasks);
+
/* Re-enqueue the tasks that have been throttled at this level. */
- list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
+ list_for_each_entry_safe(p, tmp, &throttled_tasks, throttle_node) {
+ /*
+ * Back to being throttled! Break out and put the remaining
+ * tasks back onto the limbo_list to prevent running them
+ * unnecessarily.
+ */
+ if (cfs_rq->throttle_count)
+ break;
+
list_del_init(&p->throttle_node);
p->throttled = false;
- enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
+ enqueue_task_fair(rq, p, ENQUEUE_WAKEUP);
}
+ list_splice(&throttled_tasks, &cfs_rq->throttled_limbo_list);
+
/* Add cfs_rq with load or one or more already running entities to the list */
if (!cfs_rq_is_decayed(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
@@ -6187,7 +6822,7 @@ static void record_throttle_clock(struct cfs_rq *cfs_rq)
static int tg_throttle_down(struct task_group *tg, void *data)
{
struct rq *rq = data;
- struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq));
if (cfs_rq->throttle_count++)
return 0;
@@ -6209,35 +6844,48 @@ static int tg_throttle_down(struct task_group *tg, void *data)
static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
- struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- int dequeue = 1;
+ struct sched_entity *curr = cfs_rq->curr;
+ struct rq *rq = rq_of(cfs_rq);
+
+ scoped_guard(raw_spinlock, &cfs_b->lock) {
+ u64 target_runtime = 1;
- raw_spin_lock(&cfs_b->lock);
- /* This will start the period timer if necessary */
- if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
/*
- * We have raced with bandwidth becoming available, and if we
- * actually throttled the timer might not unthrottle us for an
- * entire period. We additionally needed to make sure that any
- * subsequent check_cfs_rq_runtime calls agree not to throttle
- * us, as we may commit to do cfs put_prev+pick_next, so we ask
- * for 1ns of runtime rather than just check cfs_b.
+ * If cfs_rq->curr is still runnable, we are here from an
+ * update_curr(). Request sysctl_sched_cfs_bandwidth_slice
+ * worth of bandwidth to continue running.
+ *
+ * If the curr is not runnable, just request enough bandwidth
+ * to be runnable next time the pick selects this cfs_rq.
+ */
+ if (curr && curr->on_rq)
+ target_runtime = sched_cfs_bandwidth_slice();
+
+ /*
+ * Check if we have raced with bandwidth becoming available. If
+ * we actually throttled, the timer might not unthrottle us for
+ * an entire period. We additionally need to make sure that
+ * any subsequent runtime check agrees not to throttle us, as
+ * we may commit to do cfs put_prev+pick_next, so we ask for at
+ * least 1ns of runtime rather than just check cfs_b.
+ *
+ * This will start the period timer if necessary.
+ */
+ if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, target_runtime))
+ return false;
+
+ /*
+ * No bandwidth available; add ourselves to the list to be
+ * unthrottled later.
*/
- dequeue = 0;
- } else {
list_add_tail_rcu(&cfs_rq->throttled_list,
&cfs_b->throttled_cfs_rq);
}
- raw_spin_unlock(&cfs_b->lock);
-
- if (!dequeue)
- return false; /* Throttle no longer required. */
/* freeze hierarchy runnable averages while throttled */
- rcu_read_lock();
- walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
- rcu_read_unlock();
+ scoped_guard(rcu)
+ walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
/*
* Note: distribution will already see us throttled via the
@@ -6245,6 +6893,17 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
*/
cfs_rq->throttled = 1;
WARN_ON_ONCE(cfs_rq->throttled_clock);
+
+ /*
+ * If the current hierarchy was throttled, add throttle work to
+ * the current donor. In case of proxy-execution, the execution
+ * context cannot exit to userspace while holding a mutex, and
+ * the throttle-deferral rule of only throttling the throttled
+ * context at exit to userspace is still preserved.
+ */
+ if (curr && curr->on_rq)
+ task_throttle_setup_work(rq->donor);
+
return true;
}
@@ -6252,7 +6911,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
+ struct sched_entity *se = cfs_rq_se(cfs_rq);
/*
* It's possible we are called with runtime_remaining < 0 due to things
@@ -6262,21 +6921,25 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
* We can't unthrottle this cfs_rq without any runtime remaining because
* any enqueue in tg_unthrottle_up() will immediately trigger a throttle,
* which is not supposed to happen on unthrottle path.
+ *
+ * Catch up on the remaining runtime since last clock update before
+ * checking runtime remaining.
*/
+ update_curr(cfs_rq);
if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0)
return;
cfs_rq->throttled = 0;
- update_rq_clock(rq);
+ scoped_guard(raw_spinlock, &cfs_b->lock) {
+ list_del_rcu(&cfs_rq->throttled_list);
+
+ if (!cfs_rq->throttled_clock)
+ break;
- raw_spin_lock(&cfs_b->lock);
- if (cfs_rq->throttled_clock) {
cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
cfs_rq->throttled_clock = 0;
}
- list_del_rcu(&cfs_rq->throttled_list);
- raw_spin_unlock(&cfs_b->lock);
/* update hierarchical throttle state */
walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
@@ -6305,9 +6968,8 @@ static void __cfsb_csd_unthrottle(void *arg)
{
struct cfs_rq *cursor, *tmp;
struct rq *rq = arg;
- struct rq_flags rf;
- rq_lock(rq, &rf);
+ guard(rq_lock)(rq);
/*
* Iterating over the list can trigger several call to
@@ -6324,7 +6986,7 @@ static void __cfsb_csd_unthrottle(void *arg)
* race with group being freed in the window between removing it
* from the list and advancing to the next entry in the list.
*/
- rcu_read_lock();
+ guard(rcu)();
list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
throttled_csd_list) {
@@ -6334,10 +6996,7 @@ static void __cfsb_csd_unthrottle(void *arg)
unthrottle_cfs_rq(cursor);
}
- rcu_read_unlock();
-
rq_clock_stop_loop_update(rq);
- rq_unlock(rq, &rf);
}
static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
@@ -6346,6 +7005,7 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
bool first;
if (rq == this_rq()) {
+ update_rq_clock(rq);
unthrottle_cfs_rq(cfs_rq);
return;
}
@@ -6373,15 +7033,14 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
{
+ bool throttled = false, unthrottle_local = false;
int this_cpu = smp_processor_id();
u64 runtime, remaining = 1;
- bool throttled = false;
- struct cfs_rq *cfs_rq, *tmp;
- struct rq_flags rf;
+ struct cfs_rq *cfs_rq;
struct rq *rq;
- LIST_HEAD(local_unthrottle);
- rcu_read_lock();
+ guard(rcu)();
+
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
throttled_list) {
rq = rq_of(cfs_rq);
@@ -6391,64 +7050,66 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
break;
}
- rq_lock_irqsave(rq, &rf);
+ guard(rq_lock_irqsave)(rq);
+
if (!cfs_rq_throttled(cfs_rq))
- goto next;
+ continue;
/* Already queued for async unthrottle */
if (!list_empty(&cfs_rq->throttled_csd_list))
- goto next;
+ continue;
+
+ if (cfs_rq->curr) {
+ update_rq_clock(rq);
+ update_curr(cfs_rq);
+ }
/* By the above checks, this should never be true */
WARN_ON_ONCE(cfs_rq->runtime_remaining > 0);
- raw_spin_lock(&cfs_b->lock);
- runtime = -cfs_rq->runtime_remaining + 1;
- if (runtime > cfs_b->runtime)
- runtime = cfs_b->runtime;
- cfs_b->runtime -= runtime;
- remaining = cfs_b->runtime;
- raw_spin_unlock(&cfs_b->lock);
+ scoped_guard(raw_spinlock, &cfs_b->lock) {
+ runtime = -cfs_rq->runtime_remaining + 1;
+ if (runtime > cfs_b->runtime)
+ runtime = cfs_b->runtime;
+ cfs_b->runtime -= runtime;
+ remaining = cfs_b->runtime;
+ }
cfs_rq->runtime_remaining += runtime;
- /* we check whether we're throttled above */
- if (cfs_rq->runtime_remaining > 0) {
- if (cpu_of(rq) != this_cpu) {
- unthrottle_cfs_rq_async(cfs_rq);
- } else {
- /*
- * We currently only expect to be unthrottling
- * a single cfs_rq locally.
- */
- WARN_ON_ONCE(!list_empty(&local_unthrottle));
- list_add_tail(&cfs_rq->throttled_csd_list,
- &local_unthrottle);
- }
- } else {
+ /*
+ * Ran out of bandwidth during distribution!
+ * Indicate throttled entities and break early.
+ */
+ if (cfs_rq->runtime_remaining <= 0) {
throttled = true;
+ break;
}
-next:
- rq_unlock_irqrestore(rq, &rf);
- }
-
- list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle,
- throttled_csd_list) {
- struct rq *rq = rq_of(cfs_rq);
-
- rq_lock_irqsave(rq, &rf);
-
- list_del_init(&cfs_rq->throttled_csd_list);
-
- if (cfs_rq_throttled(cfs_rq))
- unthrottle_cfs_rq(cfs_rq);
+ /* we check whether we're throttled above */
+ if (cpu_of(rq) != this_cpu) {
+ unthrottle_cfs_rq_async(cfs_rq);
+ continue;
+ }
- rq_unlock_irqrestore(rq, &rf);
+ /*
+ * Allow a parallel async unthrottle to unthrottle
+ * this cfs_rq too via __cfsb_csd_unthrottle().
+ * If we are first, do it ourselves at the end and
+ * save on an IPI from remote CPUs.
+ */
+ unthrottle_local = list_empty(&rq->cfsb_csd_list);
+ list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
}
- WARN_ON_ONCE(!list_empty(&local_unthrottle));
- rcu_read_unlock();
+ if (unthrottle_local) {
+ /*
+ * Protect against an IPI that is also trying to flush
+ * the unthrottled cfs_rq(s) from this CPU's csd_list.
+ */
+ scoped_guard(irqsave)
+ __cfsb_csd_unthrottle(cpu_rq(this_cpu));
+ }
return throttled;
}
@@ -6572,7 +7233,8 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
if (slack_runtime <= 0)
return;
- raw_spin_lock(&cfs_b->lock);
+ guard(raw_spinlock)(&cfs_b->lock);
+
if (cfs_b->quota != RUNTIME_INF) {
cfs_b->runtime += slack_runtime;
@@ -6581,7 +7243,6 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
!list_empty(&cfs_b->throttled_cfs_rq))
start_cfs_slack_bandwidth(cfs_b);
}
- raw_spin_unlock(&cfs_b->lock);
/* even if it's not valid for return we don't want to try again */
cfs_rq->runtime_remaining -= slack_runtime;
@@ -6604,25 +7265,21 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
*/
static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
{
- u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
- unsigned long flags;
-
/* confirm we're still not at a refresh boundary */
- raw_spin_lock_irqsave(&cfs_b->lock, flags);
- cfs_b->slack_started = false;
+ scoped_guard(raw_spinlock_irqsave, &cfs_b->lock) {
+ u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
- if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
- raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
- return;
- }
+ cfs_b->slack_started = false;
- if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
- runtime = cfs_b->runtime;
+ if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
+ return;
- raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
+ if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
+ runtime = cfs_b->runtime;
- if (!runtime)
- return;
+ if (!runtime)
+ return;
+ }
distribute_cfs_runtime(cfs_b);
}
@@ -6637,7 +7294,7 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
if (!cfs_bandwidth_used())
return;
- /* an active group must be handled by the update_curr()->put() path */
+ /* an active group must be handled by the update_curr() path */
if (!cfs_rq->runtime_enabled || cfs_rq->curr)
return;
@@ -6647,8 +7304,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
/* update runtime allocation */
account_cfs_rq_runtime(cfs_rq, 0);
- if (cfs_rq->runtime_remaining <= 0)
- throttle_cfs_rq(cfs_rq);
}
static void sync_throttle(struct task_group *tg, int cpu)
@@ -6661,8 +7316,8 @@ static void sync_throttle(struct task_group *tg, int cpu)
if (!tg->parent)
return;
- cfs_rq = tg->cfs_rq[cpu];
- pcfs_rq = tg->parent->cfs_rq[cpu];
+ cfs_rq = tg_cfs_rq(tg, cpu);
+ pcfs_rq = tg_cfs_rq(tg->parent, cpu);
cfs_rq->throttle_count = pcfs_rq->throttle_count;
cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
@@ -6678,25 +7333,6 @@ static void sync_throttle(struct task_group *tg, int cpu)
cfs_rq->pelt_clock_throttled = 1;
}
-/* conditionally throttle active cfs_rq's from put_prev_entity() */
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
- if (!cfs_bandwidth_used())
- return false;
-
- if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
- return false;
-
- /*
- * it's possible for a throttled entity to be forced into a running
- * state (e.g. set_curr_task), in this case we're finished.
- */
- if (cfs_rq_throttled(cfs_rq))
- return true;
-
- return throttle_cfs_rq(cfs_rq);
-}
-
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
@@ -6711,18 +7347,18 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, period_timer);
- unsigned long flags;
int overrun;
int idle = 0;
int count = 0;
- raw_spin_lock_irqsave(&cfs_b->lock, flags);
+ CLASS(raw_spinlock_irqsave, cfsb_guard)(&cfs_b->lock);
+
for (;;) {
overrun = hrtimer_forward_now(timer, cfs_b->period);
if (!overrun)
break;
- idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
+ idle = do_sched_cfs_period_timer(cfs_b, overrun, cfsb_guard.flags);
if (++count > 3) {
u64 new, old = ktime_to_ns(cfs_b->period);
@@ -6755,11 +7391,13 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
count = 0;
}
}
- if (idle)
+
+ if (idle) {
cfs_b->period_active = 0;
- raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
+ return HRTIMER_NORESTART;
+ }
- return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+ return HRTIMER_RESTART;
}
void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent)
@@ -6826,14 +7464,12 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
*/
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
- unsigned long flags;
if (list_empty(&rq->cfsb_csd_list))
continue;
- local_irq_save(flags);
- __cfsb_csd_unthrottle(rq);
- local_irq_restore(flags);
+ scoped_guard(irqsave)
+ __cfsb_csd_unthrottle(rq);
}
}
@@ -6851,16 +7487,15 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq)
lockdep_assert_rq_held(rq);
- rcu_read_lock();
+ guard(rcu)();
+
list_for_each_entry_rcu(tg, &task_groups, list) {
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
- struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq));
- raw_spin_lock(&cfs_b->lock);
- cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
- raw_spin_unlock(&cfs_b->lock);
+ scoped_guard(raw_spinlock, &cfs_b->lock)
+ cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
}
- rcu_read_unlock();
}
/* cpu offline callback */
@@ -6881,9 +7516,10 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
*/
rq_clock_start_loop_update(rq);
- rcu_read_lock();
+ guard(rcu)();
+
list_for_each_entry_rcu(tg, &task_groups, list) {
- struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq));
if (!cfs_rq->runtime_enabled)
continue;
@@ -6904,7 +7540,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
cfs_rq->runtime_remaining = 1;
unthrottle_cfs_rq(cfs_rq);
}
- rcu_read_unlock();
rq_clock_stop_loop_update(rq);
}
@@ -6951,8 +7586,7 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
#else /* !CONFIG_CFS_BANDWIDTH: */
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
+static bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -7409,7 +8043,6 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!p->se.sched_delayed)
util_est_dequeue(&rq->cfs, p);
- util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
if (dequeue_entities(rq, &p->se, flags) < 0)
return false;
@@ -7782,7 +8415,6 @@ static inline int __select_idle_cpu(int cpu, struct task_struct *p)
return -1;
}
-#ifdef CONFIG_SCHED_SMT
DEFINE_STATIC_KEY_FALSE(sched_smt_present);
EXPORT_SYMBOL_GPL(sched_smt_present);
@@ -7790,7 +8422,7 @@ static inline void set_idle_cores(int cpu, int val)
{
struct sched_domain_shared *sds;
- sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu));
if (sds)
WRITE_ONCE(sds->has_idle_cores, val);
}
@@ -7799,7 +8431,7 @@ static inline bool test_idle_cores(int cpu)
{
struct sched_domain_shared *sds;
- sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu));
if (sds)
return READ_ONCE(sds->has_idle_cores);
@@ -7808,7 +8440,7 @@ static inline bool test_idle_cores(int cpu)
/*
* Scans the local SMT mask to see if the entire core is idle, and records this
- * information in sd_llc_shared->has_idle_cores.
+ * information in sd_balance_shared->has_idle_cores.
*
* Since SMT siblings share all cache levels, inspecting this limited remote
* state should be fairly cheap.
@@ -7838,7 +8470,8 @@ unlock:
/*
* Scan the entire LLC domain for idle cores; this dynamically switches off if
* there are no idle cores left in the system; tracked through
- * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
+ * sd_balance_shared->has_idle_cores and enabled through update_idle_core()
+ * above.
*/
static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
{
@@ -7892,29 +8525,6 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
return -1;
}
-#else /* !CONFIG_SCHED_SMT: */
-
-static inline void set_idle_cores(int cpu, int val)
-{
-}
-
-static inline bool test_idle_cores(int cpu)
-{
- return false;
-}
-
-static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
-{
- return __select_idle_cpu(core, p);
-}
-
-static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
-{
- return -1;
-}
-
-#endif /* !CONFIG_SCHED_SMT */
-
/*
* Scan the LLC domain for idle CPUs; this is dynamically regulated by
* comparing the average scan cost (tracked in sd->avg_scan_cost) against the
@@ -7925,7 +8535,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
- if (sched_feat(SIS_UTIL)) {
+ if (sched_feat(SIS_UTIL) && sd->shared) {
/*
* Increment because !--nr is the condition to stop scan.
*
@@ -7990,6 +8600,54 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
}
/*
+ * Idle-capacity scan converts util_fits_cpu() outcomes into preference ranks,
+ * where lower values indicate a better fit - see select_idle_capacity().
+ *
+ * A CPU that both fits the task and sits on a fully-idle SMT core is returned
+ * immediately and is never assigned one of these ranks. On !SMT every CPU is
+ * its own "core", so the early return covers all fits-and-idle cases and the
+ * core-tier ranks below become unreachable.
+ *
+ * Rank Val Tier Meaning
+ * ------------------------------ --- ------ ---------------------------
+ * ASYM_IDLE_UCLAMP_MISFIT -4 core Idle core; capacity fits
+ * util but uclamp_min misses.
+ * ASYM_IDLE_COMPLETE_MISFIT -3 core Idle core; capacity does
+ * not fit. Still beats every
+ * thread-tier rank: a busy
+ * sibling cuts effective
+ * capacity more than a
+ * misfit hurts a quiet core.
+ * ASYM_IDLE_THREAD_FITS -2 thread Busy SMT sibling; capacity
+ * fits util + uclamp.
+ * ASYM_IDLE_THREAD_UCLAMP_MISFIT -1 thread Busy SMT sibling; capacity
+ * fits but uclamp_min misses
+ * (native util_fits_cpu()
+ * return value).
+ * ASYM_IDLE_THREAD_MISFIT 0 thread Busy SMT sibling; capacity
+ * does not fit.
+ *
+ * ASYM_IDLE_CORE_BIAS (-3) is an offset, not a state. On an idle core,
+ * fits += ASYM_IDLE_CORE_BIAS rebases thread-tier ranks into the core tier:
+ *
+ * ASYM_IDLE_THREAD_UCLAMP_MISFIT (-1) + BIAS -> ASYM_IDLE_UCLAMP_MISFIT (-4)
+ * ASYM_IDLE_THREAD_MISFIT (0) + BIAS -> ASYM_IDLE_COMPLETE_MISFIT (-3)
+ *
+ * ASYM_IDLE_THREAD_FITS (-2) is never rebased because a fully-fitting idle-core
+ * candidate early-returns from select_idle_capacity().
+ */
+enum asym_fits_state {
+ ASYM_IDLE_UCLAMP_MISFIT = -4,
+ ASYM_IDLE_COMPLETE_MISFIT,
+ ASYM_IDLE_THREAD_FITS,
+ ASYM_IDLE_THREAD_UCLAMP_MISFIT,
+ ASYM_IDLE_THREAD_MISFIT,
+
+ /* util_fits_cpu() bias for idle core */
+ ASYM_IDLE_CORE_BIAS = -3,
+};
+
+/*
* Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
* the task fits. If no CPU is big enough, but there are idle ones, try to
* maximize capacity.
@@ -7997,10 +8655,17 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
static int
select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
{
+ /*
+ * On !SMT systems, has_idle_core is always false and preferred_core
+ * is always true (CPU == core), so the SMT preference logic below
+ * collapses to the plain capacity scan.
+ */
+ bool has_idle_core = sched_smt_active() && test_idle_cores(target);
unsigned long task_util, util_min, util_max, best_cap = 0;
- int fits, best_fits = 0;
+ int fits, best_fits = ASYM_IDLE_THREAD_MISFIT;
int cpu, best_cpu = -1;
struct cpumask *cpus;
+ int nr = INT_MAX;
cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
@@ -8009,16 +8674,41 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
util_min = uclamp_eff_value(p, UCLAMP_MIN);
util_max = uclamp_eff_value(p, UCLAMP_MAX);
+ if (sched_feat(SIS_UTIL) && sd->shared) {
+ /*
+ * Same nr_idle_scan hint as select_idle_cpu(), nr only limits
+ * the scan when not preferring an idle core.
+ */
+ nr = READ_ONCE(sd->shared->nr_idle_scan) + 1;
+ /* overloaded domain is unlikely to have idle cpu/core */
+ if (nr == 1)
+ return -1;
+ }
+
for_each_cpu_wrap(cpu, cpus, target) {
+ bool preferred_core = !has_idle_core || is_core_idle(cpu);
unsigned long cpu_cap = capacity_of(cpu);
+ /*
+ * Stop when the nr_idle_scan is exhausted (mirrors
+ * select_idle_cpu() logic).
+ */
+ if (!has_idle_core && --nr <= 0)
+ return best_cpu;
+
if (!choose_idle_cpu(cpu, p))
continue;
fits = util_fits_cpu(task_util, util_min, util_max, cpu);
- /* This CPU fits with all requirements */
- if (fits > 0)
+ /*
+ * Perfect fit: capacity satisfies util + uclamp and the CPU
+ * sits on a fully-idle SMT core, this is a !SMT system, or
+ * there is no idle core to find.
+ * Short-circuit the rank-based selection and return
+ * immediately.
+ */
+ if (fits > 0 && preferred_core)
return cpu;
/*
* Only the min performance hint (i.e. uclamp_min) doesn't fit.
@@ -8026,9 +8716,33 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
*/
else if (fits < 0)
cpu_cap = get_actual_cpu_capacity(cpu);
+ /*
+ * fits > 0 implies we are not on a preferred core, but the util
+ * fits CPU capacity. Set fits to ASYM_IDLE_THREAD_FITS
+ * so the effective range becomes
+ * [ASYM_IDLE_THREAD_FITS, ASYM_IDLE_THREAD_MISFIT], where:
+ * ASYM_IDLE_THREAD_MISFIT - does not fit
+ * ASYM_IDLE_THREAD_UCLAMP_MISFIT - fits with the exception of UCLAMP_MIN
+ * ASYM_IDLE_THREAD_FITS - fits with the exception of preferred_core
+ */
+ else if (fits > 0)
+ fits = ASYM_IDLE_THREAD_FITS;
/*
- * First, select CPU which fits better (-1 being better than 0).
+ * If we are on a preferred core, translate the range of fits
+ * of [ASYM_IDLE_THREAD_UCLAMP_MISFIT, ASYM_IDLE_THREAD_MISFIT] to
+ * [ASYM_IDLE_UCLAMP_MISFIT, ASYM_IDLE_COMPLETE_MISFIT].
+ * This ensures that an idle core is always given priority over
+ * (partially) busy core.
+ *
+ * A fully fitting idle core would have returned early and hence
+ * fits > 0 for preferred_core need not be dealt with.
+ */
+ if (preferred_core)
+ fits += ASYM_IDLE_CORE_BIAS;
+
+ /*
+ * First, select CPU which fits better (lower is more preferred).
* Then, select the one with best capacity at same level.
*/
if ((fits < best_fits) ||
@@ -8039,6 +8753,19 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
}
}
+ /*
+ * A value in the [ASYM_IDLE_UCLAMP_MISFIT, ASYM_IDLE_COMPLETE_MISFIT]
+ * range means the chosen CPU is in a fully idle SMT core. Values above
+ * ASYM_IDLE_COMPLETE_MISFIT mean we never ranked such a CPU best.
+ *
+ * The asym-capacity wakeup path returns from select_idle_sibling()
+ * after this function and never runs select_idle_cpu(), so the usual
+ * select_idle_cpu() tail that clears idle cores must live here when the
+ * idle-core preference did not win.
+ */
+ if (has_idle_core && best_fits > ASYM_IDLE_COMPLETE_MISFIT)
+ set_idle_cores(target, false);
+
return best_cpu;
}
@@ -8047,12 +8774,22 @@ static inline bool asym_fits_cpu(unsigned long util,
unsigned long util_max,
int cpu)
{
- if (sched_asym_cpucap_active())
+ if (sched_asym_cpucap_active()) {
/*
* Return true only if the cpu fully fits the task requirements
* which include the utilization and the performance hints.
+ *
+ * When SMT is active, also require that the core has no busy
+ * siblings.
+ *
+ * Note: gating on is_core_idle() also makes the early-bailout
+ * candidates in select_idle_sibling() (target, prev,
+ * recent_used_cpu) idle-core-aware on ASYM+SMT, which the
+ * NO_ASYM path does not do.
*/
- return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
+ return (!sched_smt_active() || is_core_idle(cpu)) &&
+ (util_fits_cpu(util, util_min, util_max, cpu) > 0);
+ }
return true;
}
@@ -8231,25 +8968,32 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
static unsigned long
cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
{
+ bool add_task = p && task_cpu(p) != cpu && dst_cpu == cpu;
+ bool sub_task = p && task_cpu(p) == cpu && dst_cpu != cpu;
struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
unsigned long runnable;
- if (boost) {
- runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
- util = max(util, runnable);
- }
-
/*
* If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
* contribution. If @p migrates from another CPU to @cpu add its
* contribution. In all the other cases @cpu is not impacted by the
* migration so its util_avg is already correct.
*/
- if (p && task_cpu(p) == cpu && dst_cpu != cpu)
- lsub_positive(&util, task_util(p));
- else if (p && task_cpu(p) != cpu && dst_cpu == cpu)
+ if (add_task)
util += task_util(p);
+ else if (sub_task)
+ lsub_positive(&util, task_util(p));
+
+ if (boost) {
+ runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
+ if (add_task)
+ runnable += READ_ONCE(p->se.avg.runnable_avg);
+ else if (sub_task)
+ lsub_positive(&runnable,
+ READ_ONCE(p->se.avg.runnable_avg));
+ util = max(util, runnable);
+ }
if (sched_feat(UTIL_EST)) {
unsigned long util_est;
@@ -9145,9 +9889,10 @@ pick:
/*
* Because p is enqueued, nse being null can only mean that we
- * dequeued a delayed task.
+ * dequeued a delayed task. If there are still entities queued in
+ * cfs, check if the next one will be p.
*/
- if (!nse)
+ if (!nse && cfs_rq->nr_queued)
goto pick;
if (sched_feat(RUN_TO_PARITY))
@@ -9164,17 +9909,19 @@ preempt:
resched_curr_lazy(rq);
}
-static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
+struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
+ __must_hold(__rq_lockp(rq))
{
struct sched_entity *se;
struct cfs_rq *cfs_rq;
struct task_struct *p;
bool throttled;
+ int new_tasks;
again:
cfs_rq = &rq->cfs;
if (!cfs_rq->nr_queued)
- return NULL;
+ goto idle;
throttled = false;
@@ -9183,8 +9930,6 @@ again:
if (cfs_rq->curr && cfs_rq->curr->on_rq)
update_curr(cfs_rq);
- throttled |= check_cfs_rq_runtime(cfs_rq);
-
se = pick_next_entity(rq, cfs_rq, true);
if (!se)
goto again;
@@ -9195,95 +9940,22 @@ again:
if (unlikely(throttled))
task_throttle_setup_work(p);
return p;
-}
-
-static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
-static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
-
-struct task_struct *
-pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
- __must_hold(__rq_lockp(rq))
-{
- struct sched_entity *se;
- struct task_struct *p;
- int new_tasks;
-
-again:
- p = pick_task_fair(rq, rf);
- if (!p)
- goto idle;
- se = &p->se;
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
- if (prev->sched_class != &fair_sched_class)
- goto simple;
-
- __put_prev_set_next_dl_server(rq, prev, p);
-
- /*
- * Because of the set_next_buddy() in dequeue_task_fair() it is rather
- * likely that a next task is from the same cgroup as the current.
- *
- * Therefore attempt to avoid putting and setting the entire cgroup
- * hierarchy, only change the part that actually changes.
- *
- * Since we haven't yet done put_prev_entity and if the selected task
- * is a different task than we started out with, try and touch the
- * least amount of cfs_rqs.
- */
- if (prev != p) {
- struct sched_entity *pse = &prev->se;
- struct cfs_rq *cfs_rq;
-
- while (!(cfs_rq = is_same_group(se, pse))) {
- int se_depth = se->depth;
- int pse_depth = pse->depth;
-
- if (se_depth <= pse_depth) {
- put_prev_entity(cfs_rq_of(pse), pse);
- pse = parent_entity(pse);
- }
- if (se_depth >= pse_depth) {
- set_next_entity(cfs_rq_of(se), se, true);
- se = parent_entity(se);
- }
- }
-
- put_prev_entity(cfs_rq, pse);
- set_next_entity(cfs_rq, se, true);
-
- __set_next_task_fair(rq, p, true);
- }
-
- return p;
-
-simple:
-#endif /* CONFIG_FAIR_GROUP_SCHED */
- put_prev_set_next_task(rq, prev, p);
- return p;
idle:
- if (rf) {
- new_tasks = sched_balance_newidle(rq, rf);
-
- /*
- * Because sched_balance_newidle() releases (and re-acquires)
- * rq->lock, it is possible for any higher priority task to
- * appear. In that case we must re-start the pick_next_entity()
- * loop.
- */
- if (new_tasks < 0)
- return RETRY_TASK;
-
- if (new_tasks > 0)
- goto again;
- }
+ if (sched_core_enabled(rq))
+ return NULL;
+ new_tasks = sched_balance_newidle(rq, rf);
+ if (new_tasks < 0)
+ return RETRY_TASK;
+ if (new_tasks > 0)
+ goto again;
return NULL;
}
static struct task_struct *
fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf)
+ __must_hold(__rq_lockp(dl_se->rq))
{
return pick_task_fair(dl_se->rq, rf);
}
@@ -9304,10 +9976,33 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct t
{
struct sched_entity *se = &prev->se;
struct cfs_rq *cfs_rq;
+ struct sched_entity *nse = NULL;
- for_each_sched_entity(se) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (next && next->sched_class == &fair_sched_class)
+ nse = &next->se;
+#endif
+
+ while (se) {
cfs_rq = cfs_rq_of(se);
- put_prev_entity(cfs_rq, se);
+ if (!nse || cfs_rq->curr)
+ put_prev_entity(cfs_rq, se);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (nse) {
+ if (is_same_group(se, nse))
+ break;
+
+ int d = nse->depth - se->depth;
+ if (d >= 0) {
+ /* nse has equal or greater depth, ascend */
+ nse = parent_entity(nse);
+ /* if nse is the deeper, do not ascend se */
+ if (d > 0)
+ continue;
+ }
+ }
+#endif
+ se = parent_entity(se);
}
}
@@ -9529,6 +10224,16 @@ enum group_type {
*/
group_imbalanced,
/*
+ * There are tasks running on non-preferred LLC, possible to move
+ * them to their preferred LLC without creating too much imbalance.
+ * The priority of group_llc_balance is lower than that of
+ * group_overloaded and higher than that of all other group types.
+ * This is because group_llc_balance may exacerbate load imbalance.
+ * If the LLC balancing attempt fails, the nr_balance_failed
+ * mechanism will trigger other group types to rebalance the load.
+ */
+ group_llc_balance,
+ /*
* The CPU is overloaded and can't provide expected CPU cycles to all
* tasks.
*/
@@ -9539,7 +10244,8 @@ enum migration_type {
migrate_load = 0,
migrate_util,
migrate_task,
- migrate_misfit
+ migrate_misfit,
+ migrate_llc_task
};
#define LBF_ALL_PINNED 0x01
@@ -9547,6 +10253,7 @@ enum migration_type {
#define LBF_DST_PINNED 0x04
#define LBF_SOME_PINNED 0x08
#define LBF_ACTIVE_LB 0x10
+#define LBF_LLC_PINNED 0x20
struct lb_env {
struct sched_domain *sd;
@@ -9556,6 +10263,7 @@ struct lb_env {
int dst_cpu;
struct rq *dst_rq;
+ bool dst_core_idle;
struct cpumask *dst_grpmask;
int new_dst_cpu;
@@ -9692,7 +10400,7 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_
struct cfs_rq *dst_cfs_rq;
#ifdef CONFIG_FAIR_GROUP_SCHED
- dst_cfs_rq = task_group(p)->cfs_rq[dest_cpu];
+ dst_cfs_rq = tg_cfs_rq(task_group(p), dest_cpu);
#else
dst_cfs_rq = &cpu_rq(dest_cpu)->cfs;
#endif
@@ -9703,6 +10411,298 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_
return 0;
}
+#ifdef CONFIG_SCHED_CACHE
+/*
+ * The margin used when comparing LLC utilization with CPU capacity.
+ * It determines the LLC load level where active LLC aggregation is
+ * done.
+ * Derived from fits_capacity().
+ *
+ * (default: ~50%, tunable via debugfs)
+ */
+static bool fits_llc_capacity(unsigned long util, unsigned long max)
+{
+ u32 aggr_pct = llc_overaggr_pct;
+
+ /*
+ * For non-SMT systems (one thread per core), raise the
+ * aggregation threshold to accommodate more tasks.
+ */
+ if (cpu_smt_num_threads == 1)
+ aggr_pct = (aggr_pct * 3 / 2);
+
+ return util * 100 < max * aggr_pct;
+}
+
+/*
+ * The margin used when comparing utilization.
+ * Is 'util1' noticeably greater than 'util2'?
+ * Derived from capacity_greater().
+ * Bias is in percentage.
+ */
+/* Allows dst util to be bigger than src util by up to bias percent */
+#define util_greater(util1, util2) \
+ ((util1) * 100 > (util2) * (100 + llc_imb_pct))
+
+static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util,
+ unsigned long *cap)
+{
+ struct sched_domain_shared *sd_share;
+
+ sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
+ if (!sd_share)
+ return false;
+
+ *util = READ_ONCE(sd_share->util_avg);
+ *cap = READ_ONCE(sd_share->capacity);
+
+ return true;
+}
+
+/*
+ * Decision matrix according to the LLC utilization. To
+ * decide whether we can do task aggregation across LLC.
+ *
+ * By default, 50% is the threshold for treating the LLC
+ * as busy. The reason for choosing 50% is to avoid saturation
+ * of SMT-2, and it is also a safe cutoff for other SMT-n
+ * platforms. SMT-1 has higher threshold because it is
+ * supposed to accommodate more tasks, see fits_llc_capacity().
+ *
+ * 20% is the utilization imbalance percentage to decide
+ * if the preferred LLC is busier than the non-preferred LLC.
+ * 20 is a little higher than the LLC domain's imbalance_pct
+ * 17. The hysteresis is used to avoid task bouncing between the
+ * preferred LLC and the non-preferred LLC, and it will
+ * be made tunable via debugfs.
+ *
+ * 1. moving towards the preferred LLC, dst is the preferred
+ * LLC, src is not.
+ *
+ * src \ dst 30% 40% 50% 60%
+ * 30% Y Y Y N
+ * 40% Y Y Y Y
+ * 50% Y Y G G
+ * 60% Y Y G G
+ *
+ * 2. moving out of the preferred LLC, src is the preferred
+ * LLC, dst is not:
+ *
+ * src \ dst 30% 40% 50% 60%
+ * 30% N N N N
+ * 40% N N N N
+ * 50% N N G G
+ * 60% Y N G G
+ *
+ * src : src_util
+ * dst : dst_util
+ * Y : Yes, migrate
+ * N : No, do not migrate
+ * G : let the Generic load balance to even the load.
+ *
+ * The intention is that if both LLCs are quite busy, cache aware
+ * load balance should not be performed, and generic load balance
+ * should take effect. However, if one is busy and the other is not,
+ * the preferred LLC capacity(50%) and imbalance criteria(20%) should
+ * be considered to determine whether LLC aggregation should be
+ * performed to bias the load towards the preferred LLC.
+ */
+
+/* Migration decision; the 3 states are mutually exclusive. */
+enum llc_mig {
+ mig_forbid = 0, /* N: Don't migrate task, respect LLC preference */
+ mig_llc, /* Y: Do LLC preference based migration */
+ mig_unrestricted /* G: Don't restrict generic load balance migration */
+};
+
+/*
+ * Check if task can be moved from the source LLC to the
+ * destination LLC without breaking cache aware preference.
+ * src_cpu and dst_cpu are arbitrary CPUs within the source
+ * and destination LLCs, respectively.
+ */
+static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu,
+ unsigned long tsk_util,
+ bool to_pref)
+{
+ unsigned long src_util, dst_util, src_cap, dst_cap;
+
+ if (!get_llc_stats(src_cpu, &src_util, &src_cap) ||
+ !get_llc_stats(dst_cpu, &dst_util, &dst_cap))
+ return mig_unrestricted;
+
+ src_util = src_util < tsk_util ? 0 : src_util - tsk_util;
+ dst_util = dst_util + tsk_util;
+
+ if (!fits_llc_capacity(dst_util, dst_cap) &&
+ !fits_llc_capacity(src_util, src_cap))
+ return mig_unrestricted;
+
+ if (to_pref) {
+ /*
+ * Don't migrate if we will get preferred LLC too
+ * heavily loaded and if the dest is much busier
+ * than the src, in which case migration will
+ * increase the imbalance too much.
+ */
+ if (!fits_llc_capacity(dst_util, dst_cap) &&
+ util_greater(dst_util, src_util))
+ return mig_forbid;
+ } else {
+ /*
+ * Don't migrate if we will leave the preferred LLC
+ * too idle, or if this migration would leave the
+ * non-preferred LLC within llc_imb_pct percent
+ * of the preferred LLC, leading to migration
+ * back to the preferred LLC again.
+ */
+ if (fits_llc_capacity(src_util, src_cap) ||
+ !util_greater(src_util, dst_util))
+ return mig_forbid;
+ }
+ return mig_llc;
+}
+
+/*
+ * Check if task p can migrate from source LLC to
+ * destination LLC in terms of cache aware load balance.
+ */
+static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu,
+ struct task_struct *p)
+{
+ struct mm_struct *mm;
+ bool to_pref;
+ int cpu;
+
+ mm = p->mm;
+ if (!mm)
+ return mig_unrestricted;
+
+ cpu = READ_ONCE(mm->sc_stat.cpu);
+ if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu))
+ return mig_unrestricted;
+
+ /* skip cache aware load balance for too many threads */
+ if (invalid_llc_nr(mm, p, dst_cpu) ||
+ exceed_llc_capacity(mm, dst_cpu)) {
+ if (READ_ONCE(mm->sc_stat.cpu) != -1)
+ WRITE_ONCE(mm->sc_stat.cpu, -1);
+ return mig_unrestricted;
+ }
+
+ if (cpus_share_cache(dst_cpu, cpu))
+ to_pref = true;
+ else if (cpus_share_cache(src_cpu, cpu))
+ to_pref = false;
+ else
+ return mig_unrestricted;
+
+ return can_migrate_llc(src_cpu, dst_cpu,
+ task_util(p), to_pref);
+}
+
+/*
+ * Check if active load balance breaks LLC locality in
+ * terms of cache aware load balance. The load level and
+ * imbalance do not warrant breaking LLC preference per
+ * the can_migrate_llc() policy. Here, the benefit of
+ * LLC locality outweighs the power efficiency gained from
+ * migrating the only runnable task away.
+ */
+static inline bool
+alb_break_llc(struct lb_env *env)
+{
+ if (!sched_cache_enabled())
+ return false;
+
+ if (cpus_share_cache(env->src_cpu, env->dst_cpu))
+ return false;
+ /*
+ * All tasks prefer to stay on their current CPU.
+ * Do not pull a task from its preferred CPU if:
+ * 1. It is the only task running and does not exceed
+ * imbalance allowance; OR
+ * 2. Migrating it away from its preferred LLC would violate
+ * the cache-aware scheduling policy.
+ */
+ if (env->src_rq->nr_pref_llc_running &&
+ env->src_rq->nr_pref_llc_running == env->src_rq->cfs.h_nr_runnable) {
+ unsigned long util = 0;
+ struct task_struct *cur;
+
+ if (env->src_rq->nr_running <= 1)
+ return true;
+
+ cur = rcu_dereference_all(env->src_rq->curr);
+ if (cur && cur->sched_class == &fair_sched_class)
+ util = task_util(cur);
+
+ if (can_migrate_llc(env->src_cpu, env->dst_cpu,
+ util, false) == mig_forbid)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Check if migrating task p from env->src_cpu to
+ * env->dst_cpu breaks LLC locality.
+ */
+static bool migrate_degrades_llc(struct task_struct *p, struct lb_env *env)
+{
+ if (!sched_cache_enabled())
+ return false;
+
+ if (task_has_sched_core(p))
+ return false;
+ /*
+ * Skip over tasks that would degrade LLC locality;
+ * only when nr_balance_failed is sufficiently high do we
+ * ignore this constraint.
+ *
+ * The nr_balance_failed threshold is set to
+ * cache_nice_tries + 1 to avoid excessive task
+ * migration at the same time.
+ */
+ if (env->sd->nr_balance_failed >= env->sd->cache_nice_tries + 1)
+ return false;
+
+ /*
+ * We know env->src_cpu has some tasks that prefer to
+ * run on env->dst_cpu's LLC; skip the tasks that do not
+ * prefer it, and find one that does.
+ */
+ if (env->migration_type == migrate_llc_task &&
+ READ_ONCE(p->preferred_llc) != llc_id(env->dst_cpu))
+ return true;
+
+ if (can_migrate_llc_task(env->src_cpu,
+ env->dst_cpu, p) != mig_forbid)
+ return false;
+
+ return true;
+}
+
+#else
+static inline bool get_llc_stats(int cpu, unsigned long *util,
+ unsigned long *cap)
+{
+ return false;
+}
+
+static inline bool
+alb_break_llc(struct lb_env *env)
+{
+ return false;
+}
+
+static inline bool
+migrate_degrades_llc(struct task_struct *p, struct lb_env *env)
+{
+ return false;
+}
+#endif
/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*/
@@ -9799,10 +10799,29 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
return 1;
degrades = migrate_degrades_locality(p, env);
- if (!degrades)
+ if (!degrades) {
+ /*
+ * If the NUMA locality is not broken,
+ * further check if migration would hurt
+ * LLC locality.
+ */
+ if (migrate_degrades_llc(p, env)) {
+ /*
+ * If regular load balancing fails to pull a task
+ * due to LLC locality, this is expected behavior
+ * and we set LBF_LLC_PINNED so we don't increase
+ * nr_balance_failed unnecessarily.
+ */
+ if (env->migration_type != migrate_llc_task)
+ env->flags |= LBF_LLC_PINNED;
+
+ return 0;
+ }
+
hot = task_hot(p, env);
- else
+ } else {
hot = degrades > 0;
+ }
if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
if (hot)
@@ -9964,6 +10983,10 @@ static int detach_tasks(struct lb_env *env)
env->imbalance = 0;
break;
+
+ case migrate_llc_task:
+ env->imbalance--;
+ break;
}
detach_task(p, env);
@@ -10097,7 +11120,6 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
{
struct cfs_rq *cfs_rq, *pos;
bool decayed = false;
- int cpu = cpu_of(rq);
/*
* Iterates the task_group tree in a bottom up fashion, see
@@ -10117,7 +11139,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
}
/* Propagate pending load changes to the parent, if any: */
- se = cfs_rq->tg->se[cpu];
+ se = cfs_rq_se(cfs_rq);
if (se && !skip_blocked_update(se))
update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
@@ -10143,8 +11165,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
*/
static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
{
- struct rq *rq = rq_of(cfs_rq);
- struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
+ struct sched_entity *se = cfs_rq_se(cfs_rq);
unsigned long now = jiffies;
unsigned long load;
@@ -10242,12 +11263,16 @@ struct sg_lb_stats {
enum group_type group_type;
unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
unsigned int group_smt_balance; /* Task on busy SMT be moved */
+ unsigned int group_llc_balance; /* Tasks should be moved to preferred LLC */
unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
unsigned int group_overutilized; /* At least one CPU is overutilized in the group */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
#endif
+#ifdef CONFIG_SCHED_CACHE
+ unsigned int nr_pref_dst_llc;
+#endif
};
/*
@@ -10505,6 +11530,9 @@ group_type group_classify(unsigned int imbalance_pct,
if (group_is_overloaded(imbalance_pct, sgs))
return group_overloaded;
+ if (sgs->group_llc_balance)
+ return group_llc_balance;
+
if (sg_imbalanced(group))
return group_imbalanced;
@@ -10659,6 +11687,105 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
return check_cpu_capacity(rq, sd);
}
+#ifdef CONFIG_SCHED_CACHE
+/*
+ * Record the statistics for this scheduler group for later
+ * use. These values guide load balancing on aggregating tasks
+ * to a LLC.
+ */
+static void record_sg_llc_stats(struct lb_env *env,
+ struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+ struct sched_domain_shared *sd_share;
+ int cpu;
+
+ if (!sched_cache_enabled() || env->idle == CPU_NEWLY_IDLE)
+ return;
+
+ /* Only care about sched domain spanning multiple LLCs */
+ if (env->sd->child != rcu_dereference_all(per_cpu(sd_llc, env->dst_cpu)))
+ return;
+
+ /*
+ * At this point we know this group spans a LLC domain.
+ * Record the statistic of this group in its corresponding
+ * shared LLC domain.
+ * Note: sd_share cannot be obtained via sd->child->shared,
+ * because the latter refers to the domain that covers the
+ * local group. Instead, sd_share should be located using
+ * the first CPU of the LLC group.
+ */
+ cpu = cpumask_first(sched_group_span(group));
+ sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
+ if (!sd_share)
+ return;
+
+ if (READ_ONCE(sd_share->util_avg) != sgs->group_util)
+ WRITE_ONCE(sd_share->util_avg, sgs->group_util);
+
+ if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity))
+ WRITE_ONCE(sd_share->capacity, sgs->group_capacity);
+}
+
+/*
+ * Do LLC balance on a sched group that contains an LLC and has tasks
+ * preferring to run on the LLC of the idle dst_cpu.
+ */
+static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+ if (!sched_cache_enabled())
+ return false;
+
+ if (env->sd->flags & SD_SHARE_LLC)
+ return false;
+
+ /*
+ * Skip cache aware tagging if nr_balance_failed is sufficiently high.
+ * The nr_balance_failed threshold is set to cache_nice_tries + 1
+ * to avoid excessive task migration at the same time.
+ */
+ if (env->sd->nr_balance_failed >= env->sd->cache_nice_tries + 1)
+ return false;
+
+ if (sgs->nr_pref_dst_llc &&
+ can_migrate_llc(cpumask_first(sched_group_span(group)),
+ env->dst_cpu, 0, true) == mig_llc)
+ return true;
+
+ return false;
+}
+
+static bool update_llc_busiest(struct lb_env *env,
+ struct sg_lb_stats *busiest,
+ struct sg_lb_stats *sgs)
+{
+ /*
+ * There are more tasks that want to run on dst_cpu's LLC.
+ */
+ return sgs->nr_pref_dst_llc > busiest->nr_pref_dst_llc;
+}
+#else
+static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+}
+
+static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+ return false;
+}
+
+static bool update_llc_busiest(struct lb_env *env,
+ struct sg_lb_stats *busiest,
+ struct sg_lb_stats *sgs)
+{
+ return false;
+}
+#endif
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -10695,6 +11822,20 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (cpu_overutilized(i))
sgs->group_overutilized = 1;
+#ifdef CONFIG_SCHED_CACHE
+ if (sched_cache_enabled()) {
+ struct sched_domain *sd_tmp;
+ int dst_llc;
+
+ dst_llc = llc_id(env->dst_cpu);
+ if (llc_id(i) != dst_llc) {
+ sd_tmp = rcu_dereference_all(rq->sd);
+ if (sd_tmp && (unsigned int)dst_llc < sd_tmp->llc_max)
+ sgs->nr_pref_dst_llc += sd_tmp->llc_counts[dst_llc];
+ }
+ }
+#endif
+
/*
* No need to call idle_cpu() if nr_running is not 0
*/
@@ -10735,17 +11876,24 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_weight = group->group_weight;
- /* Check if dst CPU is idle and preferred to this group */
- if (!local_group && env->idle && sgs->sum_h_nr_running &&
- sched_group_asym(env, sgs, group))
- sgs->group_asym_packing = 1;
+ if (!local_group) {
+ /* Check if dst CPU is idle and preferred to this group */
+ if (env->idle && sgs->sum_h_nr_running &&
+ sched_group_asym(env, sgs, group))
+ sgs->group_asym_packing = 1;
+
+ /* Check for loaded SMT group to be balanced to dst CPU */
+ if (smt_balance(env, sgs, group))
+ sgs->group_smt_balance = 1;
- /* Check for loaded SMT group to be balanced to dst CPU */
- if (!local_group && smt_balance(env, sgs, group))
- sgs->group_smt_balance = 1;
+ /* Check if tasks in this group can be moved to their preferred LLC */
+ if (llc_balance(env, sgs, group))
+ sgs->group_llc_balance = 1;
+ }
sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
+ record_sg_llc_stats(env, sgs, group);
/* Computing avg_load makes sense only when group is overloaded */
if (sgs->group_type == group_overloaded)
sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
@@ -10781,10 +11929,16 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* We can use max_capacity here as reduction in capacity on some
* CPUs in the group should either be possible to resolve
* internally or be covered by avg_load imbalance (eventually).
+ *
+ * When SMT is active, only pull a misfit to dst_cpu if it is on a
+ * fully idle core; otherwise the effective capacity of the core is
+ * reduced and we may not actually provide more capacity than the
+ * source.
*/
if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
(sgs->group_type == group_misfit_task) &&
- (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
+ (!env->dst_core_idle ||
+ !capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
sds->local_stat.group_type != group_has_spare))
return false;
@@ -10804,6 +11958,10 @@ static bool update_sd_pick_busiest(struct lb_env *env,
/* Select the overloaded group with highest avg_load. */
return sgs->avg_load > busiest->avg_load;
+ case group_llc_balance:
+ /* Select the group with most tasks preferring dst LLC */
+ return update_llc_busiest(env, busiest, sgs);
+
case group_imbalanced:
/*
* Select the 1st imbalanced group as we don't have any way to
@@ -11066,6 +12224,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
return false;
break;
+ case group_llc_balance:
case group_imbalanced:
case group_asym_packing:
case group_smt_balance:
@@ -11198,6 +12357,7 @@ sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int
return NULL;
break;
+ case group_llc_balance:
case group_imbalanced:
case group_asym_packing:
case group_smt_balance:
@@ -11348,6 +12508,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
unsigned long sum_util = 0;
bool sg_overloaded = 0, sg_overutilized = 0;
+ env->dst_core_idle = !sched_smt_active() || is_core_idle(env->dst_cpu);
+
do {
struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
@@ -11450,6 +12612,15 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
return;
}
+#ifdef CONFIG_SCHED_CACHE
+ if (busiest->group_type == group_llc_balance) {
+ /* Move a task that prefers the local LLC */
+ env->migration_type = migrate_llc_task;
+ env->imbalance = 1;
+ return;
+ }
+#endif
+
if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
@@ -11696,7 +12867,8 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
* group's child domain.
*/
if (sds.prefer_sibling && local->group_type == group_has_spare &&
- sibling_imbalance(env, &sds, busiest, local) > 1)
+ (busiest->group_type == group_llc_balance ||
+ sibling_imbalance(env, &sds, busiest, local) > 1))
goto force_balance;
if (busiest->group_type != group_overloaded) {
@@ -11755,7 +12927,10 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env,
{
struct rq *busiest = NULL, *rq;
unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
+ unsigned int __maybe_unused busiest_pref_llc = 0;
+ struct sched_domain __maybe_unused *sd_tmp;
unsigned int busiest_nr = 0;
+ int __maybe_unused dst_llc;
int i;
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
@@ -11883,6 +13058,23 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env,
break;
+ case migrate_llc_task:
+#ifdef CONFIG_SCHED_CACHE
+ sd_tmp = rcu_dereference_all(rq->sd);
+ dst_llc = llc_id(env->dst_cpu);
+
+ if (sd_tmp && (unsigned)dst_llc < sd_tmp->llc_max) {
+ unsigned int this_pref_llc =
+ sd_tmp->llc_counts[dst_llc];
+
+ if (busiest_pref_llc < this_pref_llc) {
+ busiest_pref_llc = this_pref_llc;
+ busiest = rq;
+ }
+ }
+#endif
+ break;
+
}
}
@@ -11934,6 +13126,9 @@ static int need_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
+ if (alb_break_llc(env))
+ return 0;
+
if (asym_active_balance(env))
return 1;
@@ -11953,7 +13148,8 @@ static int need_active_balance(struct lb_env *env)
return 1;
}
- if (env->migration_type == migrate_misfit)
+ if (env->migration_type == migrate_misfit ||
+ env->migration_type == migrate_llc_task)
return 1;
return 0;
@@ -11998,7 +13194,9 @@ static int should_we_balance(struct lb_env *env)
* balancing cores, but remember the first idle SMT CPU for
* later consideration. Find CPU on an idle core first.
*/
- if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
+ if (sched_smt_active() &&
+ !(env->sd->flags & SD_SHARE_CPUCAPACITY) &&
+ !is_core_idle(cpu)) {
if (idle_smt == -1)
idle_smt = cpu;
/*
@@ -12006,9 +13204,7 @@ static int should_we_balance(struct lb_env *env)
* idle has been found, then its not needed to check other
* SMT siblings for idleness:
*/
-#ifdef CONFIG_SCHED_SMT
cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu));
-#endif
continue;
}
@@ -12046,6 +13242,8 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd
case migrate_misfit:
__schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance);
break;
+ case migrate_llc_task:
+ break;
}
}
@@ -12249,9 +13447,16 @@ more_balance:
*
* Similarly for migration_misfit which is not related to
* load/util migration, don't pollute nr_balance_failed.
+ *
+ * The same goes for cache aware scheduling's allowance for
+ * load imbalance. If regular load balance does not
+ * migrate a task due to LLC locality, that is expected
+ * behavior, so don't pollute nr_balance_failed.
+ * See can_migrate_task().
*/
if (idle != CPU_NEWLY_IDLE &&
- env.migration_type != migrate_misfit)
+ env.migration_type != migrate_misfit &&
+ !(env.flags & LBF_LLC_PINNED))
sd->nr_balance_failed++;
if (need_active_balance(&env)) {
@@ -12755,8 +13960,6 @@ static void nohz_balancer_kick(struct rq *rq)
goto out;
}
- rcu_read_lock();
-
sd = rcu_dereference_all(rq->sd);
if (sd) {
/*
@@ -12764,8 +13967,8 @@ static void nohz_balancer_kick(struct rq *rq)
* capacity, kick the ILB to see if there's a better CPU to run on:
*/
if (rq->cfs.h_nr_runnable >= 1 && check_cpu_capacity(rq, sd)) {
- flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
- goto unlock;
+ flags |= NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
+ goto out;
}
}
@@ -12781,8 +13984,8 @@ static void nohz_balancer_kick(struct rq *rq)
*/
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
if (sched_asym(sd, i, cpu)) {
- flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
- goto unlock;
+ flags |= NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
+ goto out;
}
}
}
@@ -12793,10 +13996,8 @@ static void nohz_balancer_kick(struct rq *rq)
* When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
* to run the misfit task on.
*/
- if (check_misfit_status(rq)) {
- flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
- goto unlock;
- }
+ if (check_misfit_status(rq))
+ flags |= NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
/*
* For asymmetric systems, we do not want to nicely balance
@@ -12805,10 +14006,10 @@ static void nohz_balancer_kick(struct rq *rq)
*
* Skip the LLC logic because it's not relevant in that case.
*/
- goto unlock;
+ goto out;
}
- sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu));
if (sds) {
/*
* If there is an imbalance between LLC domains (IOW we could
@@ -12820,13 +14021,9 @@ static void nohz_balancer_kick(struct rq *rq)
* like this LLC domain has tasks we could move.
*/
nr_busy = atomic_read(&sds->nr_busy_cpus);
- if (nr_busy > 1) {
- flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
- goto unlock;
- }
+ if (nr_busy > 1)
+ flags |= NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
}
-unlock:
- rcu_read_unlock();
out:
if (READ_ONCE(nohz.needs_update))
flags |= NOHZ_NEXT_KICK;
@@ -12838,17 +14035,17 @@ out:
static void set_cpu_sd_state_busy(int cpu)
{
struct sched_domain *sd;
-
- rcu_read_lock();
sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
- if (!sd || !sd->nohz_idle)
- goto unlock;
+ /*
+ * sd->nohz_idle only pairs with nr_busy_cpus on sd->shared; if this
+ * domain has no shared object there is nothing to clear or account.
+ */
+ if (!sd || !sd->shared || !sd->nohz_idle)
+ return;
sd->nohz_idle = 0;
atomic_inc(&sd->shared->nr_busy_cpus);
-unlock:
- rcu_read_unlock();
}
void nohz_balance_exit_idle(struct rq *rq)
@@ -12867,17 +14064,14 @@ void nohz_balance_exit_idle(struct rq *rq)
static void set_cpu_sd_state_idle(int cpu)
{
struct sched_domain *sd;
-
- rcu_read_lock();
sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
- if (!sd || sd->nohz_idle)
- goto unlock;
+ /* See set_cpu_sd_state_busy(): nohz_idle is only used with sd->shared. */
+ if (!sd || !sd->shared || sd->nohz_idle)
+ return;
sd->nohz_idle = 1;
atomic_dec(&sd->shared->nr_busy_cpus);
-unlock:
- rcu_read_unlock();
}
/*
@@ -13636,7 +14830,7 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu)
struct cfs_rq *cfs_rq;
#ifdef CONFIG_FAIR_GROUP_SCHED
- cfs_rq = task_group(p)->cfs_rq[cpu];
+ cfs_rq = tg_cfs_rq(task_group(p), cpu);
#else
cfs_rq = &cpu_rq(cpu)->cfs;
#endif
@@ -13656,8 +14850,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
*/
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
- struct cfs_rq *cfs_rq;
struct sched_entity *se = &curr->se;
+ struct cfs_rq *cfs_rq;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
@@ -13670,6 +14864,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
+ task_tick_cache(rq, curr);
+
update_misfit_status(curr, rq);
check_update_overutilized_status(task_rq(curr));
@@ -13828,9 +15024,33 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
}
}
-static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
+/*
+ * Account for a task changing its policy or group.
+ *
+ * This routine is mostly called to set cfs_rq->curr field when a task
+ * migrates between groups/classes.
+ */
+static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
{
struct sched_entity *se = &p->se;
+ bool throttled = false;
+
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (IS_ENABLED(CONFIG_FAIR_GROUP_SCHED) &&
+ first && cfs_rq->curr)
+ break;
+
+ set_next_entity(cfs_rq, se, first);
+ /* ensure bandwidth has been allocated on our new cfs_rq */
+ throttled |= account_cfs_rq_runtime(cfs_rq, 0);
+ }
+
+ if (throttled)
+ task_throttle_setup_work(p);
+
+ se = &p->se;
if (task_on_rq_queued(p)) {
/*
@@ -13851,27 +15071,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
sched_fair_update_stop_tick(rq, p);
}
-/*
- * Account for a task changing its policy or group.
- *
- * This routine is mostly called to set cfs_rq->curr field when a task
- * migrates between groups/classes.
- */
-static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
-{
- struct sched_entity *se = &p->se;
-
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
- set_next_entity(cfs_rq, se, first);
- /* ensure bandwidth has been allocated on our new cfs_rq */
- account_cfs_rq_runtime(cfs_rq, 0);
- }
-
- __set_next_task_fair(rq, p, first);
-}
-
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
@@ -13899,56 +15098,38 @@ static void task_change_group_fair(struct task_struct *p)
void free_fair_sched_group(struct task_group *tg)
{
- int i;
-
- for_each_possible_cpu(i) {
- if (tg->cfs_rq)
- kfree(tg->cfs_rq[i]);
- if (tg->se)
- kfree(tg->se[i]);
- }
-
- kfree(tg->cfs_rq);
- kfree(tg->se);
+ free_percpu(tg->cfs_rq);
}
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
+ struct cfs_tg_state __percpu *state;
struct sched_entity *se;
struct cfs_rq *cfs_rq;
int i;
- tg->cfs_rq = kzalloc_objs(cfs_rq, nr_cpu_ids);
- if (!tg->cfs_rq)
- goto err;
- tg->se = kzalloc_objs(se, nr_cpu_ids);
- if (!tg->se)
+ state = alloc_percpu_gfp(struct cfs_tg_state, GFP_KERNEL);
+ if (!state)
goto err;
+ tg->cfs_rq = &state->cfs_rq;
tg->shares = NICE_0_LOAD;
init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent));
for_each_possible_cpu(i) {
- cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
- GFP_KERNEL, cpu_to_node(i));
+ cfs_rq = tg_cfs_rq(tg, i);
if (!cfs_rq)
goto err;
- se = kzalloc_node(sizeof(struct sched_entity_stats),
- GFP_KERNEL, cpu_to_node(i));
- if (!se)
- goto err_free_rq;
-
+ se = tg_se(tg, i);
init_cfs_rq(cfs_rq);
- init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
+ init_tg_cfs_entry(tg, cfs_rq, se, i, tg_se(parent, i));
init_entity_runnable_average(se);
}
return 1;
-err_free_rq:
- kfree(cfs_rq);
err:
return 0;
}
@@ -13962,7 +15143,7 @@ void online_fair_sched_group(struct task_group *tg)
for_each_possible_cpu(i) {
rq = cpu_rq(i);
- se = tg->se[i];
+ se = tg_se(tg, i);
rq_lock_irq(rq, &rf);
update_rq_clock(rq);
attach_entity_cfs_rq(se);
@@ -13978,8 +15159,8 @@ void unregister_fair_sched_group(struct task_group *tg)
destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
for_each_possible_cpu(cpu) {
- struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
- struct sched_entity *se = tg->se[cpu];
+ struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu);
+ struct sched_entity *se = tg_se(tg, cpu);
struct rq *rq = cpu_rq(cpu);
if (se) {
@@ -14015,9 +15196,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
cfs_rq->rq = rq;
init_cfs_rq_runtime(cfs_rq);
- tg->cfs_rq[cpu] = cfs_rq;
- tg->se[cpu] = se;
-
/* se could be NULL for root_task_group */
if (!se)
return;
@@ -14047,7 +15225,7 @@ static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
/*
* We can't change the weight of the root cgroup.
*/
- if (!tg->se[0])
+ if (is_root_task_group(tg))
return -EINVAL;
shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
@@ -14058,7 +15236,7 @@ static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
tg->shares = shares;
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
- struct sched_entity *se = tg->se[i];
+ struct sched_entity *se = tg_se(tg, i);
struct rq_flags rf;
/* Propagate contribution to hierarchy */
@@ -14109,8 +15287,8 @@ int sched_group_set_idle(struct task_group *tg, long idle)
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
- struct sched_entity *se = tg->se[i];
- struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
+ struct sched_entity *se = tg_se(tg, i);
+ struct cfs_rq *grp_cfs_rq = tg_cfs_rq(tg, i);
bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
long idle_task_delta;
struct rq_flags rf;
@@ -14183,7 +15361,6 @@ DEFINE_SCHED_CLASS(fair) = {
.wakeup_preempt = wakeup_preempt_fair,
.pick_task = pick_task_fair,
- .pick_next_task = pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
.set_next_task = set_next_task_fair,
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 84c4fe3abd74..8f0dee8fc475 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -110,8 +110,16 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
* rq lock and possibly create a large contention, sending an
* IPI to that CPU and let that CPU push the RT task to where
* it should go may be a better scenario.
+ *
+ * This is best for PREEMPT_RT, but for non-RT it can cause issues
+ * when preemption is disabled for long periods of time. So only
+ * enable it by default for PREEMPT_RT.
*/
+# ifdef CONFIG_PREEMPT_RT
SCHED_FEAT(RT_PUSH_IPI, true)
+# else
+SCHED_FEAT(RT_PUSH_IPI, false)
+# endif
#endif
SCHED_FEAT(RT_RUNTIME_SHARE, false)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index a83be0c834dd..052435f4d3e3 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -280,6 +280,14 @@ static void do_idle(void)
int cpu = smp_processor_id();
bool got_tick = false;
+ if (cpu_is_offline(cpu)) {
+ local_irq_disable();
+ /* All per-CPU kernel threads should be done by now. */
+ WARN_ON_ONCE(need_resched());
+ cpuhp_report_idle_dead();
+ arch_cpu_idle_dead();
+ }
+
/*
* Check if we need to update blocked load
*/
@@ -331,11 +339,6 @@ static void do_idle(void)
*/
local_irq_disable();
- if (cpu_is_offline(cpu)) {
- cpuhp_report_idle_dead();
- arch_cpu_idle_dead();
- }
-
arch_cpu_idle_enter();
rcu_nocb_flush_deferred_wakeup();
@@ -462,7 +465,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int flags)
}
static int
-balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+balance_idle(struct rq *rq, struct rq_flags *rf)
{
return WARN_ON_ONCE(1);
}
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 623445603725..cb957b8f1946 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -164,8 +164,26 @@
| MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \
| MEMBARRIER_CMD_GET_REGISTRATIONS)
+/*
+ * Scoped guard for memory barriers on entry and exit.
+ * Matches memory barriers before & after rq->curr modification in scheduler.
+ */
+DEFINE_LOCK_GUARD_0(mb, smp_mb(), smp_mb())
static DEFINE_MUTEX(membarrier_ipi_mutex);
+static DEFINE_PER_CPU(struct mutex, membarrier_cpu_mutexes);
+
#define SERIALIZE_IPI() guard(mutex)(&membarrier_ipi_mutex)
+#define SERIALIZE_IPI_CPU(cpu_id) guard(mutex)(&per_cpu(membarrier_cpu_mutexes, cpu_id))
+
+static int __init membarrier_init(void)
+{
+ int i;
+
+ for_each_possible_cpu(i)
+ mutex_init(&per_cpu(membarrier_cpu_mutexes, i));
+ return 0;
+}
+core_initcall(membarrier_init);
static void ipi_mb(void *info)
{
@@ -199,7 +217,16 @@ static void ipi_rseq(void *info)
* is negligible.
*/
smp_mb();
- rseq_sched_switch_event(current);
+ /*
+ * Legacy mode requires that IDs are written and the critical section is
+ * evaluated. V2 optimized mode handles the critical section and IDs are
+ * only updated if they change as a consequence of preemption after
+ * return from this IPI.
+ */
+ if (rseq_v2(current))
+ rseq_sched_switch_event(current);
+ else
+ rseq_force_update();
}
static void ipi_sync_rq_state(void *info)
@@ -249,23 +276,19 @@ void membarrier_update_current_mm(struct mm_struct *next_mm)
static int membarrier_global_expedited(void)
{
+ cpumask_var_t __free(free_cpumask_var) tmpmask = CPUMASK_VAR_NULL;
int cpu;
- cpumask_var_t tmpmask;
if (num_online_cpus() == 1)
return 0;
- /*
- * Matches memory barriers after rq->curr modification in
- * scheduler.
- */
- smp_mb(); /* system call entry is not a mb. */
-
if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
return -ENOMEM;
+ guard(mb)();
SERIALIZE_IPI();
- cpus_read_lock();
+ guard(cpus_read_lock)();
+
rcu_read_lock();
for_each_online_cpu(cpu) {
struct task_struct *p;
@@ -301,21 +324,11 @@ static int membarrier_global_expedited(void)
smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
preempt_enable();
- free_cpumask_var(tmpmask);
- cpus_read_unlock();
-
- /*
- * Memory barrier on the caller thread _after_ we finished
- * waiting for the last IPI. Matches memory barriers before
- * rq->curr modification in scheduler.
- */
- smp_mb(); /* exit from system call is not a mb */
return 0;
}
static int membarrier_private_expedited(int flags, int cpu_id)
{
- cpumask_var_t tmpmask;
struct mm_struct *mm = current->mm;
smp_call_func_t ipi_func = ipi_mb;
@@ -352,30 +365,45 @@ static int membarrier_private_expedited(int flags, int cpu_id)
* On RISC-V, this barrier pairing is also needed for the
* SYNC_CORE command when switching between processes, cf.
* the inline comments in membarrier_arch_switch_mm().
+ *
+ * On scope exit, guard(mb) issues a memory barrier on the caller
+ * thread _after_ we finished waiting for the last IPI. Matches memory
+ * barriers before rq->curr modification in scheduler.
*/
- smp_mb(); /* system call entry is not a mb. */
-
- if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
- return -ENOMEM;
-
- SERIALIZE_IPI();
- cpus_read_lock();
-
+ guard(mb)();
if (cpu_id >= 0) {
+ if (cpu_id >= nr_cpu_ids || !cpu_possible(cpu_id))
+ return 0;
+
+ SERIALIZE_IPI_CPU(cpu_id);
+ guard(cpus_read_lock)();
struct task_struct *p;
- if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
- goto out;
+ if (!cpu_online(cpu_id))
+ return 0;
+
rcu_read_lock();
p = rcu_dereference(cpu_rq(cpu_id)->curr);
if (!p || p->mm != mm) {
rcu_read_unlock();
- goto out;
+ return 0;
}
rcu_read_unlock();
+ /*
+ * smp_call_function_single() will call ipi_func() if cpu_id
+ * is the calling CPU.
+ */
+ smp_call_function_single(cpu_id, ipi_func, NULL, 1);
} else {
+ cpumask_var_t __free(free_cpumask_var) tmpmask = CPUMASK_VAR_NULL;
int cpu;
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
+
+ SERIALIZE_IPI();
+ guard(cpus_read_lock)();
+
rcu_read_lock();
for_each_online_cpu(cpu) {
struct task_struct *p;
@@ -385,15 +413,6 @@ static int membarrier_private_expedited(int flags, int cpu_id)
__cpumask_set_cpu(cpu, tmpmask);
}
rcu_read_unlock();
- }
-
- if (cpu_id >= 0) {
- /*
- * smp_call_function_single() will call ipi_func() if cpu_id
- * is the calling CPU.
- */
- smp_call_function_single(cpu_id, ipi_func, NULL, 1);
- } else {
/*
* For regular membarrier, we can save a few cycles by
* skipping the current cpu -- we're about to do smp_mb()
@@ -420,18 +439,6 @@ static int membarrier_private_expedited(int flags, int cpu_id)
}
}
-out:
- if (cpu_id < 0)
- free_cpumask_var(tmpmask);
- cpus_read_unlock();
-
- /*
- * Memory barrier on the caller thread _after_ we finished
- * waiting for the last IPI. Matches memory barriers before
- * rq->curr modification in scheduler.
- */
- smp_mb(); /* exit from system call is not a mb */
-
return 0;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4ee8faf01441..e474c31d8fe6 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -19,9 +19,9 @@ int sysctl_sched_rt_period = 1000000;
/*
* part of the period that we allow rt tasks to run in us.
- * default: 0.95s
+ * default: 1s
*/
-int sysctl_sched_rt_runtime = 950000;
+int sysctl_sched_rt_runtime = 1000000;
#ifdef CONFIG_SYSCTL
static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ;
@@ -1596,8 +1596,14 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
resched_curr(rq);
}
-static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
+static int balance_rt(struct rq *rq, struct rq_flags *rf)
{
+ /*
+ * Note, rq->donor may change during rq lock drops,
+ * so don't re-use p across lock drops
+ */
+ struct task_struct *p = rq->donor;
+
if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
/*
* This is OK, because current is on_cpu, which avoids it being
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9f63b15d309d..c7c2dea65edd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -421,6 +421,10 @@ extern void ext_server_init(struct rq *rq);
extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
u64 runtime, u64 period, bool init);
+extern int dl_server_attach_bw(struct sched_dl_entity *dl_se);
+extern void dl_server_detach_bw(struct sched_dl_entity *dl_se);
+extern int dl_server_swap_bw(struct sched_dl_entity *detach_se,
+ struct sched_dl_entity *attach_se);
static inline bool dl_server_active(struct sched_dl_entity *dl_se)
{
@@ -480,10 +484,8 @@ struct task_group {
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
- /* schedulable entities of this group on each CPU */
- struct sched_entity **se;
/* runqueue "owned" by this group on each CPU */
- struct cfs_rq **cfs_rq;
+ struct cfs_rq __percpu *cfs_rq;
unsigned long shares;
/*
* load_avg can be heavily contended at clock tick time, so put
@@ -889,6 +891,7 @@ struct dl_rq {
bool overloaded;
+ struct sched_dl_entity *curr;
/*
* Tasks on this rq that can be pushed away. They are kept in
* an rb-tree, ordered by tasks' deadlines, with caching
@@ -929,7 +932,8 @@ struct dl_rq {
};
#ifdef CONFIG_FAIR_GROUP_SCHED
-
+/* Check whether a task group is the root task group */
+#define is_root_task_group(tg) ((tg) == &root_task_group)
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q)
@@ -1187,6 +1191,12 @@ struct rq {
struct scx_rq scx;
struct sched_dl_entity ext_server;
#endif
+#ifdef CONFIG_SCHED_CACHE
+ raw_spinlock_t cpu_epoch_lock ____cacheline_aligned;
+ u64 cpu_runtime;
+ unsigned long cpu_epoch;
+ unsigned long cpu_epoch_next;
+#endif
struct sched_dl_entity fair_server;
@@ -1199,6 +1209,12 @@ struct rq {
#ifdef CONFIG_NUMA_BALANCING
unsigned int numa_migrate_on;
#endif
+
+#ifdef CONFIG_SCHED_CACHE
+ unsigned int nr_pref_llc_running;
+ unsigned int nr_llc_running;
+#endif
+
/*
* This is part of a global counter where only the total sum
* over all CPUs matters. A task can increase this counter on
@@ -1546,6 +1562,14 @@ extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
extern void sched_core_get(void);
extern void sched_core_put(void);
+static inline bool task_has_sched_core(struct task_struct *p)
+{
+ if (sched_core_disabled())
+ return false;
+
+ return !!p->core_cookie;
+}
+
#else /* !CONFIG_SCHED_CORE: */
static inline bool sched_core_enabled(struct rq *rq)
@@ -1586,6 +1610,11 @@ static inline bool sched_group_cookie_match(struct rq *rq,
return true;
}
+static inline bool task_has_sched_core(struct task_struct *p)
+{
+ return false;
+}
+
#endif /* !CONFIG_SCHED_CORE */
#ifdef CONFIG_RT_GROUP_SCHED
@@ -1667,21 +1696,15 @@ do { \
flags = _raw_spin_rq_lock_irqsave(rq); \
} while (0)
-#ifdef CONFIG_SCHED_SMT
extern void __update_idle_core(struct rq *rq);
static inline void update_idle_core(struct rq *rq)
{
- if (static_branch_unlikely(&sched_smt_present))
+ if (sched_smt_active())
__update_idle_core(rq);
}
-#else /* !CONFIG_SCHED_SMT: */
-static inline void update_idle_core(struct rq *rq) { }
-#endif /* !CONFIG_SCHED_SMT */
-
#ifdef CONFIG_FAIR_GROUP_SCHED
-
static inline struct task_struct *task_of(struct sched_entity *se)
{
WARN_ON_ONCE(!entity_is_task(se));
@@ -2082,6 +2105,8 @@ init_numa_balancing(u64 clone_flags, struct task_struct *p)
#endif /* !CONFIG_NUMA_BALANCING */
+int task_llc(const struct task_struct *p);
+
static inline void
queue_balance_callback(struct rq *rq,
struct balance_callback *head,
@@ -2171,6 +2196,7 @@ DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(int, sd_share_id);
DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
@@ -2267,6 +2293,46 @@ static inline struct task_group *task_group(struct task_struct *p)
return p->sched_task_group;
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Defined here to be available before stats.h is included, since
+ * stats.h has dependencies on things defined later in this file.
+ */
+struct cfs_tg_state {
+ struct cfs_rq cfs_rq;
+ struct sched_entity se;
+ struct sched_statistics stats;
+} __no_randomize_layout;
+
+/* Access a specific CPU's cfs_rq from a task group */
+static inline struct cfs_rq *tg_cfs_rq(struct task_group *tg, int cpu)
+{
+ return per_cpu_ptr(tg->cfs_rq, cpu);
+}
+
+static inline struct sched_entity *tg_se(struct task_group *tg, int cpu)
+{
+ struct cfs_tg_state *state;
+
+ if (is_root_task_group(tg))
+ return NULL;
+
+ state = container_of(tg_cfs_rq(tg, cpu), struct cfs_tg_state, cfs_rq);
+ return &state->se;
+}
+
+static inline struct sched_entity *cfs_rq_se(struct cfs_rq *cfs_rq)
+{
+ struct cfs_tg_state *state;
+
+ if (is_root_task_group(cfs_rq->tg))
+ return NULL;
+
+ state = container_of(cfs_rq, struct cfs_tg_state, cfs_rq);
+ return &state->se;
+}
+#endif
+
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
@@ -2275,10 +2341,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
- set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);
- p->se.cfs_rq = tg->cfs_rq[cpu];
- p->se.parent = tg->se[cpu];
- p->se.depth = tg->se[cpu] ? tg->se[cpu]->depth + 1 : 0;
+ set_task_rq_fair(&p->se, p->se.cfs_rq, tg_cfs_rq(tg, cpu));
+ p->se.cfs_rq = tg_cfs_rq(tg, cpu);
+ p->se.parent = tg_se(tg, cpu);
+ p->se.depth = p->se.parent ? p->se.parent->depth + 1 : 0;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -2561,23 +2627,12 @@ struct sched_class {
/*
* schedule/pick_next_task/prev_balance: rq->lock
*/
- int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
+ int (*balance)(struct rq *rq, struct rq_flags *rf);
/*
* schedule/pick_next_task: rq->lock
*/
struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf);
- /*
- * Optional! When implemented pick_next_task() should be equivalent to:
- *
- * next = pick_task();
- * if (next) {
- * put_prev_task(prev);
- * set_next_task_first(next);
- * }
- */
- struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev,
- struct rq_flags *rf);
/*
* sched_change:
@@ -2801,8 +2856,7 @@ static inline bool sched_fair_runnable(struct rq *rq)
return rq->cfs.nr_queued > 0;
}
-extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev,
- struct rq_flags *rf);
+extern struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf);
extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf);
#define SCA_CHECK 0x01
@@ -4037,6 +4091,29 @@ static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct
static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { }
#endif /* !CONFIG_SCHED_MM_CID */
+#ifdef CONFIG_SCHED_CACHE
+DECLARE_STATIC_KEY_FALSE(sched_cache_present);
+DECLARE_STATIC_KEY_FALSE(sched_cache_active);
+extern int sysctl_sched_cache_user;
+extern unsigned int llc_aggr_tolerance;
+extern unsigned int llc_epoch_period;
+extern unsigned int llc_epoch_affinity_timeout;
+extern unsigned int llc_imb_pct;
+extern unsigned int llc_overaggr_pct;
+
+static inline bool sched_cache_enabled(void)
+{
+ return static_branch_unlikely(&sched_cache_active);
+}
+
+extern void sched_cache_active_set(void);
+
+#endif
+
+void sched_domains_free_llc_id(int cpu);
+
+extern void init_sched_mm(struct task_struct *p);
+
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
static inline
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index a612cf253c87..ebe0a7765f98 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -89,19 +89,12 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt
#endif /* CONFIG_SCHEDSTATS */
-#ifdef CONFIG_FAIR_GROUP_SCHED
-struct sched_entity_stats {
- struct sched_entity se;
- struct sched_statistics stats;
-} __no_randomize_layout;
-#endif
-
static inline struct sched_statistics *
__schedstats_from_se(struct sched_entity *se)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
if (!entity_is_task(se))
- return &container_of(se, struct sched_entity_stats, se)->stats;
+ return &container_of(se, struct cfs_tg_state, se)->stats;
#endif
return &task_of(se)->stats;
}
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index f95798baddeb..c909ca0d8c87 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -16,7 +16,7 @@ select_task_rq_stop(struct task_struct *p, int cpu, int flags)
}
static int
-balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+balance_stop(struct rq *rq, struct rq_flags *rf)
{
return sched_stop_runnable(rq);
}
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 5847b83d9d55..622e2e01974c 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -19,8 +19,10 @@ void sched_domains_mutex_unlock(void)
}
/* Protected by sched_domains_mutex: */
+static cpumask_var_t sched_domains_llc_id_allocmask;
static cpumask_var_t sched_domains_tmpmask;
static cpumask_var_t sched_domains_tmpmask2;
+int max_lid;
static int __init sched_debug_setup(char *str)
{
@@ -621,6 +623,12 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
} while (sg != first);
}
+static void free_sched_domain_shared(struct sched_domain_shared *sds)
+{
+ if (sds && atomic_dec_and_test(&sds->ref))
+ kfree(sds);
+}
+
static void destroy_sched_domain(struct sched_domain *sd)
{
/*
@@ -629,9 +637,12 @@ static void destroy_sched_domain(struct sched_domain *sd)
* dropping group/capacity references, freeing where none remain.
*/
free_sched_groups(sd->groups, 1);
+ free_sched_domain_shared(sd->shared);
- if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
- kfree(sd->shared);
+#ifdef CONFIG_SCHED_CACHE
+ /* only the bottom sd has an llc_counts array */
+ kfree(sd->llc_counts);
+#endif
kfree(sd);
}
@@ -663,9 +674,10 @@ static void destroy_sched_domains(struct sched_domain *sd)
*/
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
-DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(int, sd_llc_id) = -1;
DEFINE_PER_CPU(int, sd_share_id);
DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
@@ -692,7 +704,6 @@ static void update_top_cache_domain(int cpu)
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
- per_cpu(sd_llc_id, cpu) = id;
rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
sd = lowest_flag_domain(cpu, SD_CLUSTER);
@@ -713,7 +724,18 @@ static void update_top_cache_domain(int cpu)
rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
+ /*
+ * The shared object is attached to sd_asym_cpucapacity only when the
+ * asym domain is non-overlapping (i.e., not built from SD_NUMA).
+ * On overlapping (NUMA) asym domains we fall back to letting the
+ * SD_SHARE_LLC path own the shared object, so sd->shared may be NULL
+ * here.
+ */
+ if (sd && sd->shared)
+ sds = sd->shared;
+
rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
+ rcu_assign_pointer(per_cpu(sd_balance_shared, cpu), sds);
}
/*
@@ -737,7 +759,14 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
/* Pick reference to parent->shared. */
if (parent->shared) {
- WARN_ON_ONCE(tmp->shared);
+ /*
+ * It is safe to free a sd->shared that
+ * has not been published yet. If a
+ * sd->shared was published, the refcount
+ * will end up being non-zero and it will
+ * not be freed here.
+ */
+ free_sched_domain_shared(tmp->shared);
tmp->shared = parent->shared;
parent->shared = NULL;
}
@@ -762,10 +791,20 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
if (sd && sd_degenerate(sd)) {
tmp = sd;
sd = sd->parent;
- destroy_sched_domain(tmp);
+
if (sd) {
struct sched_group *sg = sd->groups;
+#ifdef CONFIG_SCHED_CACHE
+ /* move buffer to parent as child is being destroyed */
+ sd->llc_counts = tmp->llc_counts;
+ sd->llc_max = tmp->llc_max;
+ sd->llc_bytes = tmp->llc_bytes;
+ /* make sure destroy_sched_domain() does not free it */
+ tmp->llc_counts = NULL;
+ tmp->llc_max = 0;
+ tmp->llc_bytes = 0;
+#endif
/*
* sched groups hold the flags of the child sched
* domain for convenience. Clear such flags since
@@ -777,6 +816,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
sd->child = NULL;
}
+
+ destroy_sched_domain(tmp);
}
sched_domain_debug(sd, cpu);
@@ -804,6 +845,239 @@ enum s_alloc {
sa_none,
};
+#ifdef CONFIG_SCHED_CACHE
+/* hardware support for cache aware scheduling */
+DEFINE_STATIC_KEY_FALSE(sched_cache_present);
+/*
+ * Indicator of whether cache aware scheduling
+ * is active, used by the scheduler.
+ */
+DEFINE_STATIC_KEY_FALSE(sched_cache_active);
+/* user wants cache aware scheduling [0 or 1] */
+int sysctl_sched_cache_user = 1;
+
+/*
+ * Get the effective LLC size in bytes that @cpu's bottom sched_domain
+ * can use. A CPU within a cpuset partition can only use a proportion
+ * of the physical LLC, scaled by the ratio of the partition's span
+ * weight to the hardware LLC sharing weight. @sd should be the
+ * topmost domain with SD_SHARE_LLC.
+ *
+ * Returns 0 if cacheinfo is not yet populated. This happens during
+ * early boot when build_sched_domains() runs before the generic
+ * cacheinfo framework has been initialized (cacheinfo_cpu_online()
+ * is a device_initcall cpuhp callback). In that case,
+ * cacheinfo_cpu_online() will later call sched_update_llc_bytes()
+ * to fill in the bottom domain's llc_bytes once the cache attributes
+ * are available.
+ */
+static unsigned long get_effective_llc_bytes(int cpu,
+ struct sched_domain *sd)
+{
+ struct cacheinfo *ci;
+ unsigned int hw_weight;
+
+ ci = get_cpu_cacheinfo_llc(cpu);
+ if (!ci)
+ return 0;
+
+ hw_weight = cpumask_weight(&ci->shared_cpu_map);
+ if (!hw_weight)
+ return 0;
+
+ return div_u64((u64)ci->size * sd->span_weight, hw_weight);
+}
+
+static bool alloc_sd_llc(const struct cpumask *cpu_map,
+ struct s_data *d)
+{
+ struct sched_domain *sd, *top_llc, *parent;
+ unsigned int *p;
+ int i;
+
+ for_each_cpu(i, cpu_map) {
+ sd = *per_cpu_ptr(d->sd, i);
+ if (!sd)
+ goto err;
+
+ p = kcalloc_node(max_lid + 1, sizeof(unsigned int),
+ GFP_KERNEL, cpu_to_node(i));
+ if (!p)
+ goto err;
+
+ top_llc = sd;
+ /*
+ * Find the topmost SD_SHARE_LLC domain.
+ * Not yet attached to the CPU, so per_cpu(sd_llc, i)
+ * can not be used.
+ */
+ while ((parent = rcu_dereference_protected(top_llc->parent, true)) &&
+ (parent->flags & SD_SHARE_LLC))
+ top_llc = parent;
+
+ if (top_llc->flags & SD_SHARE_LLC) {
+ sd->llc_max = max_lid + 1;
+ sd->llc_counts = p;
+ sd->llc_bytes = get_effective_llc_bytes(i, top_llc);
+ } else {
+ /* avoid memory leak */
+ kfree(p);
+ }
+ }
+
+ return true;
+err:
+ for_each_cpu(i, cpu_map) {
+ sd = *per_cpu_ptr(d->sd, i);
+ if (sd) {
+ kfree(sd->llc_counts);
+ sd->llc_counts = NULL;
+ sd->llc_max = 0;
+ sd->llc_bytes = 0;
+ }
+ }
+
+ return false;
+}
+
+/*
+ * Enable/disable cache aware scheduling according to
+ * user input and the presence of hardware support.
+ */
+static void _sched_cache_active_set(void)
+{
+ lockdep_assert_cpus_held();
+ lockdep_assert_held(&sched_domains_mutex);
+
+ /* hardware does not support */
+ if (!static_branch_likely(&sched_cache_present)) {
+ static_branch_disable_cpuslocked(&sched_cache_active);
+ if (sched_debug())
+ pr_info("%s: cache aware scheduling not supported on this platform\n", __func__);
+ return;
+ }
+
+ /*
+ * user wants it or not ?
+ * TBD: read before writing the static key.
+ * It is not in the critical path, leave as-is
+ * for now.
+ */
+ if (sysctl_sched_cache_user) {
+ static_branch_enable_cpuslocked(&sched_cache_active);
+ if (sched_debug())
+ pr_info("%s: enabling cache aware scheduling\n", __func__);
+ } else {
+ static_branch_disable_cpuslocked(&sched_cache_active);
+ if (sched_debug())
+ pr_info("%s: disabling cache aware scheduling\n", __func__);
+ }
+}
+
+/* used by debugfs */
+void sched_cache_active_set(void)
+{
+ cpus_read_lock();
+ sched_domains_mutex_lock();
+ _sched_cache_active_set();
+ sched_domains_mutex_unlock();
+ cpus_read_unlock();
+}
+
+/*
+ * Update the bottom sched_domain's llc_bytes for @cpu and all its
+ * LLC siblings. Called from cacheinfo_cpu_online() or
+ * cacheinfo_cpu_pre_down() with cpu hotplug lock held.
+ *
+ * Note: get_effective_llc_bytes() returns 0 on PowerPC,
+ * so cache aware scheduling is disabled on PowerPC for
+ * now. PowerPC does not use the generic cacheinfo framework --
+ * it has its own cacheinfo with a separate struct cache hierarchy
+ * and does not populate the per-CPU struct cpu_cacheinfo array
+ * that get_cpu_cacheinfo_llc() reads.
+ */
+void sched_update_llc_bytes(unsigned int cpu)
+{
+ struct sched_domain *sd, *sdp;
+ unsigned int i;
+
+ sched_domains_mutex_lock();
+
+ sdp = rcu_dereference_sched_domain(per_cpu(sd_llc, cpu));
+ if (!sdp)
+ goto unlock;
+
+ /*
+ * ci->shared_cpu_map is built incrementally as CPUs come
+ * online, so the first CPU in an LLC initially sees
+ * hw_weight == 1 and computes an inflated llc_bytes in
+ * get_effective_llc_bytes(). Re-evaluating every LLC
+ * sibling on each online event corrects this once the full
+ * shared_cpu_map is known.
+ */
+ for_each_cpu(i, sched_domain_span(sdp)) {
+ sd = rcu_dereference_sched_domain(cpu_rq(i)->sd);
+ if (sd)
+ sd->llc_bytes = get_effective_llc_bytes(i, sdp);
+ }
+
+unlock:
+ sched_domains_mutex_unlock();
+}
+
+static void sched_cache_set(bool has_multi_llcs)
+{
+ /*
+ * TBD: check before writing to it. sched domain rebuild
+ * is not in the critical path, leave as-is for now.
+ */
+ if (has_multi_llcs)
+ static_branch_enable_cpuslocked(&sched_cache_present);
+ else
+ static_branch_disable_cpuslocked(&sched_cache_present);
+
+ _sched_cache_active_set();
+}
+#else
+static bool alloc_sd_llc(const struct cpumask *cpu_map,
+ struct s_data *d)
+{
+ return false;
+}
+static inline void sched_cache_set(bool has_multi_llcs) { }
+#endif
+
+/*
+ * Return true if @sd belongs to an LLC group whose enclosing
+ * partition spans more than one LLC. @sd must be the topmost
+ * SD_SHARE_LLC domain.
+ *
+ * Any duplicated parent domains with the same span as @sd are
+ * skipped: before cpu_attach_domain() degeneration these still
+ * exist, after degeneration the loop is a no-op. This makes the
+ * helper usable both during sched domain build and against an
+ * already-attached domain tree.
+ *
+ * Note: For systems with a single LLC per node, cache-aware
+ * scheduling is still enabled when multiple nodes exist.
+ * However, NUMA balancing decisions take precedence over
+ * cache-aware scheduling. Conversely, if there is only one
+ * LLC per partition, cache-aware scheduling should be disabled.
+ */
+static bool sd_in_multi_llcs(struct sched_domain *sd)
+{
+ struct sched_domain *sdp = sd->parent;
+
+ /* it does not make sense to aggregate to 1 CPU */
+ if (sd->span_weight == 1)
+ return false;
+
+ while (sdp && sdp->span_weight == sd->span_weight)
+ sdp = sdp->parent;
+
+ return !!sdp;
+}
+
/*
* Return the canonical balance CPU for this group, this is the first CPU
* of this group that's also in the balance mask.
@@ -1310,9 +1584,7 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
cpumask_copy(mask, sched_group_span(sg));
for_each_cpu(cpu, mask) {
cores++;
-#ifdef CONFIG_SCHED_SMT
cpumask_andnot(mask, mask, cpu_smt_mask(cpu));
-#endif
}
sg->cores = cores;
@@ -1790,8 +2062,22 @@ const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu
{
return cpu_coregroup_mask(cpu);
}
+
+/*
+ * The majority of architectures have their LLC at the MC domain level,
+ * with exceptions such as powerpc. Provide a way for an arch to specify
+ * where its LLC is if it falls into the exception category.
+ */
+# ifndef arch_llc_mask
+#define arch_llc_mask(cpu) cpu_coregroup_mask(cpu)
+# endif
+
+#else
+#define arch_llc_mask(cpu) cpumask_of(cpu)
#endif
+#define llc_mask(cpu) arch_llc_mask(cpu)
+
const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu)
{
return cpu_node_mask(cpu);
@@ -2650,14 +2936,153 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc)
}
}
+static void
+init_sched_domain_shared(struct s_data *d, struct sched_domain *sd, int flags)
+{
+ struct sched_domain_shared *sds = NULL;
+ int cpu;
+
+ /*
+ * Multiple domains can try to claim a shared object like
+ * SD_ASYM_CPUCAPACITY and SD_SHARE_LLC which can alias to
+ * same cpumask_first(sched_domain_span(sd)) CPU and can
+ * cause "nr_idle_scan" to be populated incorrectly during
+ * load balancing.
+ *
+ * Find the first CPU in sched_domain_span(sd) with an
+ * unclaimed domain (!alloc_flags) or where alloc_flags
+ * matches the requested flag (SD_* flag).
+ *
+ * If the domain only has single CPU, allow temporary overlap
+ * in allocation since the domains will be degenerated later.
+ */
+ for_each_cpu(cpu, sched_domain_span(sd)) {
+ sds = *per_cpu_ptr(d->sds, cpu);
+
+ if (!sds->alloc_flags ||
+ sd->span_weight == 1 ||
+ sds->alloc_flags == flags) {
+ sds->alloc_flags = flags;
+ sd->shared = sds;
+ break;
+ }
+ }
+
+ /*
+ * Use the sd_shared corresponding to the last
+ * CPU in the span if none are available.
+ */
+ if (WARN_ON_ONCE(!sd->shared))
+ sd->shared = sds;
+
+ /*
+ * nr_busy_cpus is consumed only by the NOHZ kick path via
+ * sd_balance_shared; on the asym-capacity path it is initialized but
+ * never read.
+ */
+ atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
+ atomic_inc(&sd->shared->ref);
+}
+
+/*
+ * For asymmetric CPU capacity, attach sched_domain_shared on the innermost
+ * SD_ASYM_CPUCAPACITY_FULL ancestor of @cpu's base domain when that ancestor is
+ * not an overlapping NUMA-built domain (then LLC should claim shared).
+ *
+ * A CPU may lack any FULL ancestor (e.g., exclusive cpuset symmetric island),
+ * then LLC must claim shared instead.
+ *
+ * Note: SD_ASYM_CPUCAPACITY_FULL is only set when all CPU capacity values
+ * are present in the domain span, so the asym domain we attach to cannot
+ * degenerate into a single-capacity group. The relevant edge cases are instead
+ * covered by the caveats above.
+ *
+ * Return true if this CPU's asym path claimed sd->shared, false otherwise.
+ */
+static bool claim_asym_sched_domain_shared(struct s_data *d, int cpu)
+{
+ struct sched_domain *sd = *per_cpu_ptr(d->sd, cpu);
+ struct sched_domain *sd_asym;
+
+ if (!sd)
+ return false;
+
+ sd_asym = sd;
+ while (sd_asym && !(sd_asym->flags & SD_ASYM_CPUCAPACITY_FULL))
+ sd_asym = sd_asym->parent;
+
+ if (!sd_asym || (sd_asym->flags & SD_NUMA))
+ return false;
+
+ init_sched_domain_shared(d, sd_asym, SD_ASYM_CPUCAPACITY);
+ return true;
+}
+
+static int __sched_domains_alloc_llc_id(void)
+{
+ int lid, max;
+
+ lockdep_assert_held(&sched_domains_mutex);
+
+ lid = cpumask_first_zero(sched_domains_llc_id_allocmask);
+ /*
+ * llc_id space should never grow larger than the
+ * possible number of CPUs in the system.
+ */
+ if (lid >= nr_cpu_ids)
+ return -1;
+
+ __cpumask_set_cpu(lid, sched_domains_llc_id_allocmask);
+ max = cpumask_last(sched_domains_llc_id_allocmask);
+ if (max > max_lid)
+ max_lid = max;
+
+ return lid;
+}
+
+static void __sched_domains_free_llc_id(int cpu)
+{
+ int i, lid, max;
+
+ lockdep_assert_held(&sched_domains_mutex);
+
+ lid = per_cpu(sd_llc_id, cpu);
+ if (lid == -1 || lid >= nr_cpu_ids)
+ return;
+
+ per_cpu(sd_llc_id, cpu) = -1;
+
+ for_each_cpu(i, llc_mask(cpu)) {
+ /* An online CPU owns the llc_id. */
+ if (per_cpu(sd_llc_id, i) == lid)
+ return;
+ }
+
+ __cpumask_clear_cpu(lid, sched_domains_llc_id_allocmask);
+
+ max = cpumask_last(sched_domains_llc_id_allocmask);
+ /* shrink max lid to save memory */
+ if (max < max_lid)
+ max_lid = max;
+}
+
+void sched_domains_free_llc_id(int cpu)
+{
+ sched_domains_mutex_lock();
+ __sched_domains_free_llc_id(cpu);
+ sched_domains_mutex_unlock();
+}
+
/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
*/
static int
-build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
+build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ bool *multi_llcs)
{
enum s_alloc alloc_state = sa_none;
+ bool has_multi_llcs = false;
struct sched_domain *sd;
struct s_data d;
struct rq *rq = NULL;
@@ -2675,6 +3100,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* Set up domains for CPUs specified by the cpu_map: */
for_each_cpu(i, cpu_map) {
struct sched_domain_topology_level *tl;
+ int lid;
sd = NULL;
for_each_sd_topology(tl) {
@@ -2688,6 +3114,29 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
break;
}
+
+ lid = per_cpu(sd_llc_id, i);
+ if (lid == -1) {
+ /* try to reuse the llc_id of its siblings */
+ for (int j = cpumask_first(llc_mask(i));
+ j < nr_cpu_ids;
+ j = cpumask_next(j, llc_mask(i))) {
+ if (i == j)
+ continue;
+
+ lid = per_cpu(sd_llc_id, j);
+
+ if (lid != -1) {
+ per_cpu(sd_llc_id, i) = lid;
+
+ break;
+ }
+ }
+
+ /* a new LLC is detected */
+ if (lid == -1)
+ per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id();
+ }
}
if (WARN_ON(!topology_span_sane(cpu_map)))
@@ -2712,23 +3161,27 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (!sd)
continue;
+ if (has_asym)
+ claim_asym_sched_domain_shared(&d, i);
+
/* First, find the topmost SD_SHARE_LLC domain */
while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
sd = sd->parent;
if (sd->flags & SD_SHARE_LLC) {
- int sd_id = cpumask_first(sched_domain_span(sd));
-
- sd->shared = *per_cpu_ptr(d.sds, sd_id);
- atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
- atomic_inc(&sd->shared->ref);
+ init_sched_domain_shared(&d, sd, SD_SHARE_LLC);
/*
* In presence of higher domains, adjust the
* NUMA imbalance stats for the hierarchy.
*/
- if (IS_ENABLED(CONFIG_NUMA) && sd->parent)
- adjust_numa_imbalance(sd);
+ if (sd->parent) {
+ if (IS_ENABLED(CONFIG_NUMA))
+ adjust_numa_imbalance(sd);
+
+ if (sd_in_multi_llcs(sd))
+ has_multi_llcs = true;
+ }
}
}
@@ -2743,6 +3196,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
init_sched_groups_capacity(i, sd);
}
+ alloc_sd_llc(cpu_map, &d);
+
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
@@ -2767,6 +3222,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
ret = 0;
error:
+ *multi_llcs = has_multi_llcs;
__free_domain_allocs(&d, alloc_state, cpu_map);
return ret;
@@ -2829,8 +3285,10 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
*/
int __init sched_init_domains(const struct cpumask *cpu_map)
{
+ bool multi_llcs;
int err;
+ zalloc_cpumask_var(&sched_domains_llc_id_allocmask, GFP_KERNEL);
zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
@@ -2842,7 +3300,9 @@ int __init sched_init_domains(const struct cpumask *cpu_map)
if (!doms_cur)
doms_cur = &fallback_doms;
cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_TYPE_DOMAIN));
- err = build_sched_domains(doms_cur[0], NULL);
+ err = build_sched_domains(doms_cur[0], NULL, &multi_llcs);
+ if (!err)
+ sched_cache_set(multi_llcs);
return err;
}
@@ -2915,6 +3375,7 @@ static void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new
struct sched_domain_attr *dattr_new)
{
bool __maybe_unused has_eas = false;
+ bool has_multi_llcs = false, multi_llcs;
int i, j, n;
int new_topology;
@@ -2964,14 +3425,41 @@ match1:
for (i = 0; i < ndoms_new; i++) {
for (j = 0; j < n && !new_topology; j++) {
if (cpumask_equal(doms_new[i], doms_cur[j]) &&
- dattrs_equal(dattr_new, i, dattr_cur, j))
+ dattrs_equal(dattr_new, i, dattr_cur, j)) {
+ /*
+ * A reused partition has to be taken care of
+ * here, because of a corner case: if the
+ * reused partition is skipped and only the
+ * new partitions are considered, an incorrect
+ * has_multi_llcs would be set. For
+ * example:
+ * If the only multi-LLC partition is reused
+ * and a new single-LLC partition is built,
+ * sched_cache_set(false) disables cache-aware
+ * scheduling globally despite the reused
+ * multi-LLC partition still being active.
+ */
+ struct sched_domain *sd;
+ int cpu = cpumask_first(doms_cur[j]);
+
+ guard(rcu)();
+ sd = rcu_dereference(cpu_rq(cpu)->sd);
+ while (sd && sd->parent && (sd->parent->flags & SD_SHARE_LLC))
+ sd = sd->parent;
+ if (sd && (sd->flags & SD_SHARE_LLC) && sd->parent &&
+ sd_in_multi_llcs(sd))
+ has_multi_llcs = true;
goto match2;
+ }
}
/* No match - add a new doms_new */
- build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
+ build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL,
+ &multi_llcs);
+ has_multi_llcs |= multi_llcs;
match2:
;
}
+ sched_cache_set(has_multi_llcs);
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
/* Build perf domains: */
diff --git a/kernel/signal.c b/kernel/signal.c
index 2d102e025883..9c2b32c4d755 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1338,6 +1338,7 @@ int zap_other_threads(struct task_struct *p)
int count = 0;
p->signal->group_stop_count = 0;
+ task_clear_jobctl_pending(p, JOBCTL_PENDING_MASK);
for_other_threads(p, t) {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 3fe6b0c99f3d..773d8e9ae30c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -633,6 +633,11 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
EXPORT_SYMBOL_GPL(stop_machine);
#ifdef CONFIG_SCHED_SMT
+/*
+ * INTEL_IFS is the only user of this API. That selftest can
+ * only be compiled if SMP=y. On x86 it selects SCHED_SMT.
+ * Keep the ifdefs for now.
+ */
int stop_core_cpuslocked(unsigned int cpu, cpu_stop_fn_t fn, void *data)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
diff --git a/kernel/sys.c b/kernel/sys.c
index 62e842055cc9..df69bd71de03 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2565,14 +2565,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = put_user(me->pdeath_signal, (int __user *)arg2);
break;
case PR_GET_DUMPABLE:
- error = get_dumpable(me->mm);
+ error = task_exec_state_get_dumpable(me);
break;
case PR_SET_DUMPABLE:
- if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) {
+ if (arg2 != TASK_DUMPABLE_OFF && arg2 != TASK_DUMPABLE_OWNER) {
error = -EINVAL;
break;
}
- set_dumpable(me->mm, arg2);
+ task_exec_state_set_dumpable(arg2);
break;
case PR_SET_UNALIGN:
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 02aac7c5aa76..d098ac39bde4 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -16,10 +16,6 @@ config ARCH_CLOCKSOURCE_INIT
config ARCH_WANTS_CLOCKSOURCE_READ_INLINE
bool
-# Timekeeping vsyscall support
-config GENERIC_TIME_VSYSCALL
- bool
-
# The generic clock events infrastructure
config GENERIC_CLOCKEVENTS
def_bool !LEGACY_TIMER_TICK
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 6e173d70d825..ea5be5870e51 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -337,48 +337,32 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
EXPORT_SYMBOL_GPL(alarm_init);
/**
- * alarm_start - Sets an absolute alarm to fire
- * @alarm: ptr to alarm to set
- * @start: time to run the alarm
+ * alarm_start_timer - Sets an alarm to fire
+ * @alarm: Pointer to alarm to set
+ * @expires: Expiry time
+ * @relative: True if @expires is relative
+ *
+ * Returns: True if the alarm was queued. False if it already expired
*/
-void alarm_start(struct alarm *alarm, ktime_t start)
+bool alarm_start_timer(struct alarm *alarm, ktime_t expires, bool relative)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- scoped_guard(spinlock_irqsave, &base->lock) {
- alarm->node.expires = start;
- alarmtimer_enqueue(base, alarm);
- hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
- }
+ if (relative)
+ expires = ktime_add_safe(expires, base->get_ktime());
trace_alarmtimer_start(alarm, base->get_ktime());
-}
-EXPORT_SYMBOL_GPL(alarm_start);
-
-/**
- * alarm_start_relative - Sets a relative alarm to fire
- * @alarm: ptr to alarm to set
- * @start: time relative to now to run the alarm
- */
-void alarm_start_relative(struct alarm *alarm, ktime_t start)
-{
- struct alarm_base *base = &alarm_bases[alarm->type];
-
- start = ktime_add_safe(start, base->get_ktime());
- alarm_start(alarm, start);
-}
-EXPORT_SYMBOL_GPL(alarm_start_relative);
-
-void alarm_restart(struct alarm *alarm)
-{
- struct alarm_base *base = &alarm_bases[alarm->type];
guard(spinlock_irqsave)(&base->lock);
- hrtimer_set_expires(&alarm->timer, alarm->node.expires);
- hrtimer_restart(&alarm->timer);
+ alarm->node.expires = expires;
alarmtimer_enqueue(base, alarm);
+ if (!hrtimer_start_range_ns_user(&alarm->timer, expires, 0, HRTIMER_MODE_ABS)) {
+ alarmtimer_dequeue(base, alarm);
+ return false;
+ }
+ return true;
}
-EXPORT_SYMBOL_GPL(alarm_restart);
+EXPORT_SYMBOL_GPL(alarm_start_timer);
/**
* alarm_try_to_cancel - Tries to cancel an alarm timer
@@ -512,8 +496,6 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
* @now: time at the timer expiration
*
* Posix timer callback for expired alarm timers.
- *
- * Return: whether the timer is to be restarted
*/
static void alarm_handle_timer(struct alarm *alarm, ktime_t now)
{
@@ -527,12 +509,12 @@ static void alarm_handle_timer(struct alarm *alarm, ktime_t now)
* alarm_timer_rearm - Posix timer callback for rearming timer
* @timr: Pointer to the posixtimer data struct
*/
-static void alarm_timer_rearm(struct k_itimer *timr)
+static bool alarm_timer_rearm(struct k_itimer *timr)
{
struct alarm *alarm = &timr->it.alarm.alarmtimer;
timr->it_overrun += alarm_forward_now(alarm, timr->it_interval);
- alarm_start(alarm, alarm->node.expires);
+ return alarm_start_timer(alarm, alarm->node.expires, false);
}
/**
@@ -588,7 +570,7 @@ static void alarm_timer_wait_running(struct k_itimer *timr)
* @absolute: Expiry value is absolute time
* @sigev_none: Posix timer does not deliver signals
*/
-static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires,
+static bool alarm_timer_arm(struct k_itimer *timr, ktime_t expires,
bool absolute, bool sigev_none)
{
struct alarm *alarm = &timr->it.alarm.alarmtimer;
@@ -596,10 +578,16 @@ static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires,
if (!absolute)
expires = ktime_add_safe(expires, base->get_ktime());
- if (sigev_none)
+
+ /*
+ * sigev_none needs to update the expires value and pretend
+ * that the timer is queued
+ */
+ if (sigev_none) {
alarm->node.expires = expires;
- else
- alarm_start(&timr->it.alarm.alarmtimer, expires);
+ return true;
+ }
+ return alarm_start_timer(&timr->it.alarm.alarmtimer, expires, false);
}
/**
@@ -706,7 +694,9 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
alarm->data = (void *)current;
do {
set_current_state(TASK_INTERRUPTIBLE);
- alarm_start(alarm, absexp);
+ if (!alarm_start_timer(alarm, absexp, false))
+ alarm->data = NULL;
+
if (likely(alarm->data))
schedule();
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 5e22697b098d..0014d163f989 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -301,7 +301,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
#include <asm/clock_inlined.h>
#else
static __always_inline void
-arch_inlined_clockevent_set_next_coupled(u64 u64 cycles, struct clock_event_device *dev) { }
+arch_inlined_clockevent_set_next_coupled(u64 cycles, struct clock_event_device *dev) { }
#endif
static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires)
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index baee13a1f87f..e48c4d379a7c 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -1222,14 +1222,8 @@ static void clocksource_enqueue(struct clocksource *cs)
* @cs: clocksource to be registered
* @scale: Scale factor multiplied against freq to get clocksource hz
* @freq: clocksource frequency (cycles per second) divided by scale
- *
- * This should only be called from the clocksource->enable() method.
- *
- * This *SHOULD NOT* be called directly! Please use the
- * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper
- * functions.
*/
-void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
+static void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
{
u64 sec;
@@ -1287,7 +1281,6 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq
pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
}
-EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
/**
* __clocksource_register_scale - Used to install new clocksources
@@ -1338,6 +1331,26 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
}
EXPORT_SYMBOL_GPL(__clocksource_register_scale);
+static void __devm_clocksource_unregister(void *data)
+{
+ struct clocksource *cs = data;
+
+ clocksource_unregister(cs);
+}
+
+int __devm_clocksource_register_scale(struct device *dev, struct clocksource *cs,
+ u32 scale, u32 freq)
+{
+ int ret;
+
+ ret = __clocksource_register_scale(cs, scale, freq);
+ if (ret)
+ return ret;
+
+ return devm_add_action_or_reset(dev, __devm_clocksource_unregister, cs);
+}
+EXPORT_SYMBOL_GPL(__devm_clocksource_register_scale);
+
/*
* Unbind clocksource @cs. Called with clocksource_mutex held
*/
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5bd6efe598f0..638ce623c342 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1352,8 +1352,14 @@ static inline bool hrtimer_keep_base(struct hrtimer *timer, bool is_local, bool
return hrtimer_prefer_local(is_local, is_first, is_pinned);
}
-static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
- const enum hrtimer_mode mode, struct hrtimer_clock_base *base)
+enum {
+ HRTIMER_REPROGRAM_NONE,
+ HRTIMER_REPROGRAM,
+ HRTIMER_REPROGRAM_FORCE,
+};
+
+static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
+ const enum hrtimer_mode mode, struct hrtimer_clock_base *base)
{
struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
bool is_pinned, first, was_first, keep_base = false;
@@ -1410,7 +1416,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del
/* If a deferred rearm is pending skip reprogramming the device */
if (cpu_base->deferred_rearm) {
cpu_base->deferred_needs_update = true;
- return false;
+ return HRTIMER_REPROGRAM_NONE;
}
if (!was_first || cpu_base != this_cpu_base) {
@@ -1423,7 +1429,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del
* callbacks.
*/
if (likely(hrtimer_base_is_online(this_cpu_base)))
- return first;
+ return first ? HRTIMER_REPROGRAM : HRTIMER_REPROGRAM_NONE;
/*
* Timer was enqueued remote because the current base is
@@ -1432,7 +1438,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del
*/
if (first)
smp_call_function_single_async(cpu_base->cpu, &cpu_base->csd);
- return false;
+ return HRTIMER_REPROGRAM_NONE;
}
/*
@@ -1446,7 +1452,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del
*/
if (timer->is_lazy) {
if (cpu_base->expires_next <= hrtimer_get_expires(timer))
- return false;
+ return HRTIMER_REPROGRAM_NONE;
}
/*
@@ -1455,8 +1461,24 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del
* reprogram the hardware by evaluating the new first expiring
* timer.
*/
- hrtimer_force_reprogram(cpu_base, /* skip_equal */ true);
- return false;
+ return HRTIMER_REPROGRAM_FORCE;
+}
+
+static int hrtimer_start_range_ns_common(struct hrtimer *timer, ktime_t tim,
+ u64 delta_ns, const enum hrtimer_mode mode,
+ struct hrtimer_clock_base *base)
+{
+ /*
+ * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
+ * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard
+ * expiry mode because unmarked timers are moved to softirq expiry.
+ */
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
+ else
+ WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard);
+
+ return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, base);
}
/**
@@ -1476,24 +1498,104 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
debug_hrtimer_assert_init(timer);
+ base = lock_hrtimer_base(timer, &flags);
+
+ switch (hrtimer_start_range_ns_common(timer, tim, delta_ns, mode, base)) {
+ case HRTIMER_REPROGRAM:
+ hrtimer_reprogram(timer, true);
+ break;
+ case HRTIMER_REPROGRAM_FORCE:
+ hrtimer_force_reprogram(timer->base->cpu_base, 1);
+ break;
+ case HRTIMER_REPROGRAM_NONE:
+ break;
+ }
+
+ unlock_hrtimer_base(timer, &flags);
+}
+EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
+
+static inline bool hrtimer_check_user_timer(struct hrtimer *timer)
+{
+ struct hrtimer_cpu_base *cpu_base = timer->base->cpu_base;
+ ktime_t expires;
+
/*
- * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
- * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard
- * expiry mode because unmarked timers are moved to softirq expiry.
+ * This uses soft expires because that's the user provided
+ * expiry time, while expires can be further in the future
+ * due to a slack value added to the user expiry time.
*/
- if (!IS_ENABLED(CONFIG_PREEMPT_RT))
- WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
- else
- WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard);
+ expires = hrtimer_get_softexpires(timer);
+
+ /* Convert to monotonic */
+ expires = ktime_sub(expires, timer->base->offset);
+
+ /*
+ * Check whether this timer will end up as the first expiring timer in
+ * the CPU base. If not, no further checks required as it's then
+ * guaranteed to expire in the future.
+ */
+ if (expires >= cpu_base->expires_next)
+ return true;
+
+ /* Validate that the expiry time is in the future. */
+ if (expires > ktime_get())
+ return true;
+
+ debug_hrtimer_deactivate(timer);
+ __remove_hrtimer(timer, timer->base, HRTIMER_STATE_INACTIVE, false);
+ trace_hrtimer_start_expired(timer);
+ return false;
+}
+
+/**
+ * hrtimer_start_range_ns_user - (re)start a user controlled hrtimer
+ * @timer: the timer to be added
+ * @tim: expiry time
+ * @delta_ns: "slack" range for the timer
+ * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or
+ * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
+ * softirq based mode is considered for debug purpose only!
+ *
+ * Returns: True when the timer was queued, false if it was already expired
+ *
+ * This function cannot invoke the timer callback for expired timers as it might
+ * be called under a lock which the timer callback needs to acquire. So the
+ * caller has to handle that case.
+ */
+bool hrtimer_start_range_ns_user(struct hrtimer *timer, ktime_t tim,
+ u64 delta_ns, const enum hrtimer_mode mode)
+{
+ struct hrtimer_clock_base *base;
+ unsigned long flags;
+ bool ret = true;
+
+ debug_hrtimer_assert_init(timer);
base = lock_hrtimer_base(timer, &flags);
- if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base))
- hrtimer_reprogram(timer, true);
+ switch (hrtimer_start_range_ns_common(timer, tim, delta_ns, mode, base)) {
+ case HRTIMER_REPROGRAM:
+ ret = hrtimer_check_user_timer(timer);
+ if (ret)
+ hrtimer_reprogram(timer, true);
+ break;
+ case HRTIMER_REPROGRAM_FORCE:
+ ret = hrtimer_check_user_timer(timer);
+ /*
+ * The base must always be reevaluated, independent of the
+ * result above because the timer was the first pending timer.
+ */
+ hrtimer_force_reprogram(timer->base->cpu_base, 1);
+ break;
+ case HRTIMER_REPROGRAM_NONE:
+ break;
+ }
unlock_hrtimer_base(timer, &flags);
+ return ret;
}
-EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
+EXPORT_SYMBOL_GPL(hrtimer_start_range_ns_user);
/**
* hrtimer_try_to_cancel - try to deactivate a timer
@@ -1681,10 +1783,10 @@ EXPORT_SYMBOL_GPL(__hrtimer_get_remaining);
*
* Returns the next expiry time or KTIME_MAX if no timer is pending.
*/
-u64 hrtimer_get_next_event(void)
+ktime_t hrtimer_get_next_event(void)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
- u64 expires = KTIME_MAX;
+ ktime_t expires = KTIME_MAX;
guard(raw_spinlock_irqsave)(&cpu_base->lock);
if (!hrtimer_hres_active(cpu_base))
@@ -1700,10 +1802,10 @@ u64 hrtimer_get_next_event(void)
* Returns the next expiry time over all timers except for the @exclude one or
* KTIME_MAX if none of them is pending.
*/
-u64 hrtimer_next_event_without(const struct hrtimer *exclude)
+ktime_t hrtimer_next_event_without(const struct hrtimer *exclude)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
- u64 expires = KTIME_MAX;
+ ktime_t expires = KTIME_MAX;
unsigned int active;
guard(raw_spinlock_irqsave)(&cpu_base->lock);
@@ -2213,7 +2315,11 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode
if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard)
mode |= HRTIMER_MODE_HARD;
- hrtimer_start_expires(&sl->timer, mode);
+ /* If already expired, clear the task pointer and set current state to running */
+ if (!hrtimer_start_expires_user(&sl->timer, mode)) {
+ sl->task = NULL;
+ __set_current_state(TASK_RUNNING);
+ }
}
EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 1c954f330dfe..d51428867a33 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -60,15 +60,14 @@ EXPORT_SYMBOL(get_jiffies_64);
EXPORT_SYMBOL(jiffies);
-static int __init init_jiffies_clocksource(void)
-{
- return __clocksource_register(&clocksource_jiffies);
-}
-
-core_initcall(init_jiffies_clocksource);
+static bool cs_jiffies_registered __initdata;
struct clocksource * __init __weak clocksource_default_clock(void)
{
+ if (!cs_jiffies_registered) {
+ __clocksource_register(&clocksource_jiffies);
+ cs_jiffies_registered = true;
+ }
return &clocksource_jiffies;
}
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 4bca3f78c8ea..5fa0af66cf3f 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -57,6 +57,7 @@ ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim,
return tim;
}
+EXPORT_SYMBOL_GPL(do_timens_ktime_to_host);
static struct ucounts *inc_time_namespaces(struct user_namespace *ns)
{
@@ -351,6 +352,7 @@ struct time_namespace init_time_ns = {
.user_ns = &init_user_ns,
.frozen_offsets = true,
};
+EXPORT_SYMBOL_GPL(init_time_ns);
void __init time_ns_init(void)
{
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0de2bb7cbec0..74775b94d11b 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -19,7 +19,7 @@
#include "posix-timers.h"
-static void posix_cpu_timer_rearm(struct k_itimer *timer);
+static bool posix_cpu_timer_rearm(struct k_itimer *timer);
void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit)
{
@@ -1011,24 +1011,27 @@ static void check_process_timers(struct task_struct *tsk,
/*
* This is called from the signal code (via posixtimer_rearm)
* when the last timer signal was delivered and we have to reload the timer.
+ *
+ * Return true unconditionally so the core code assumes the timer to be
+ * armed. Otherwise it would requeue the signal.
*/
-static void posix_cpu_timer_rearm(struct k_itimer *timer)
+static bool posix_cpu_timer_rearm(struct k_itimer *timer)
{
clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
- struct task_struct *p;
struct sighand_struct *sighand;
+ struct task_struct *p;
unsigned long flags;
u64 now;
- rcu_read_lock();
+ guard(rcu)();
p = cpu_timer_task_rcu(timer);
if (!p)
- goto out;
+ return true;
/* Protect timer list r/w in arm_timer() */
sighand = lock_task_sighand(p, &flags);
if (unlikely(sighand == NULL))
- goto out;
+ return true;
/*
* Fetch the current sample and update the timer's expiry time.
@@ -1045,8 +1048,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)
*/
arm_timer(timer, p);
unlock_task_sighand(p, &flags);
-out:
- rcu_read_unlock();
+ return true;
}
/**
@@ -1504,6 +1506,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
spin_lock_irq(&timer.it_lock);
error = posix_cpu_timer_set(&timer, flags, &it, NULL);
if (error) {
+ posix_cpu_timer_del(&timer);
spin_unlock_irq(&timer.it_lock);
return error;
}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 9331e1614124..436ba794cc0b 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -288,16 +288,18 @@ static inline int timer_overrun_to_int(struct k_itimer *timr)
return (int)timr->it_overrun_last;
}
-static void common_hrtimer_rearm(struct k_itimer *timr)
+static bool common_hrtimer_rearm(struct k_itimer *timr)
{
struct hrtimer *timer = &timr->it.real.timer;
timr->it_overrun += hrtimer_forward_now(timer, timr->it_interval);
- hrtimer_restart(timer);
+ return hrtimer_start_expires_user(timer, HRTIMER_MODE_ABS);
}
static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_itimer *timr)
{
+ bool queued;
+
guard(spinlock)(&timr->it_lock);
/*
@@ -311,12 +313,18 @@ static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_it
if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING))
return true;
- timr->kclock->timer_rearm(timr);
- timr->it_status = POSIX_TIMER_ARMED;
+ /* timer_rearm() updates timr::it_overrun */
+ queued = timr->kclock->timer_rearm(timr);
+
timr->it_overrun_last = timr->it_overrun;
timr->it_overrun = -1LL;
++timr->it_signal_seq;
info->si_overrun = timer_overrun_to_int(timr);
+
+ if (queued)
+ timr->it_status = POSIX_TIMER_ARMED;
+ else
+ posix_timer_queue_signal(timr);
return true;
}
@@ -795,7 +803,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
return timer_overrun_to_int(scoped_timer);
}
-static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
+static bool common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
bool absolute, bool sigev_none)
{
struct hrtimer *timer = &timr->it.real.timer;
@@ -820,8 +828,11 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
expires = ktime_add_safe(expires, hrtimer_cb_get_time(timer));
hrtimer_set_expires(timer, expires);
- if (!sigev_none)
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+ /* For sigev_none pretend that the timer is queued */
+ if (sigev_none)
+ return true;
+
+ return hrtimer_start_expires_user(timer, HRTIMER_MODE_ABS);
}
static int common_hrtimer_try_to_cancel(struct k_itimer *timr)
@@ -903,9 +914,13 @@ int common_timer_set(struct k_itimer *timr, int flags,
expires = timens_ktime_to_host(timr->it_clock, expires);
sigev_none = timr->it_sigev_notify == SIGEV_NONE;
- kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none);
- if (!sigev_none)
- timr->it_status = POSIX_TIMER_ARMED;
+ if (kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none)) {
+ if (!sigev_none)
+ timr->it_status = POSIX_TIMER_ARMED;
+ } else {
+ /* Timer was already expired, queue the signal */
+ posix_timer_queue_signal(timr);
+ }
return 0;
}
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index 7f259e845d24..4ea9611dd716 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -27,11 +27,11 @@ struct k_clock {
int (*timer_del)(struct k_itimer *timr);
void (*timer_get)(struct k_itimer *timr,
struct itimerspec64 *cur_setting);
- void (*timer_rearm)(struct k_itimer *timr);
+ bool (*timer_rearm)(struct k_itimer *timr);
s64 (*timer_forward)(struct k_itimer *timr, ktime_t now);
ktime_t (*timer_remaining)(struct k_itimer *timr, ktime_t now);
int (*timer_try_to_cancel)(struct k_itimer *timr);
- void (*timer_arm)(struct k_itimer *timr, ktime_t expires,
+ bool (*timer_arm)(struct k_itimer *timr, ktime_t expires,
bool absolute, bool sigev_none);
void (*timer_wait_running)(struct k_itimer *timr);
};
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cbbb87a0c6e7..98a9cae915c0 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -285,8 +285,6 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
if (IS_ENABLED(CONFIG_NO_HZ_COMMON) &&
tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
touch_softlockup_watchdog_sched();
- if (is_idle_task(current))
- ts->idle_jiffies++;
/*
* In case the current tick fired too early past its expected
* expiration, make sure we don't bypass the next clock reprogramming
@@ -751,119 +749,6 @@ static void tick_nohz_update_jiffies(ktime_t now)
touch_softlockup_watchdog_sched();
}
-static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
-{
- ktime_t delta;
-
- if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)))
- return;
-
- delta = ktime_sub(now, ts->idle_entrytime);
-
- write_seqcount_begin(&ts->idle_sleeptime_seq);
- if (nr_iowait_cpu(smp_processor_id()) > 0)
- ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
- else
- ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-
- ts->idle_entrytime = now;
- tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE);
- write_seqcount_end(&ts->idle_sleeptime_seq);
-
- sched_clock_idle_wakeup_event();
-}
-
-static void tick_nohz_start_idle(struct tick_sched *ts)
-{
- write_seqcount_begin(&ts->idle_sleeptime_seq);
- ts->idle_entrytime = ktime_get();
- tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE);
- write_seqcount_end(&ts->idle_sleeptime_seq);
-
- sched_clock_idle_sleep_event();
-}
-
-static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
- bool compute_delta, u64 *last_update_time)
-{
- ktime_t now, idle;
- unsigned int seq;
-
- if (!tick_nohz_active)
- return -1;
-
- now = ktime_get();
- if (last_update_time)
- *last_update_time = ktime_to_us(now);
-
- do {
- seq = read_seqcount_begin(&ts->idle_sleeptime_seq);
-
- if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) && compute_delta) {
- ktime_t delta = ktime_sub(now, ts->idle_entrytime);
-
- idle = ktime_add(*sleeptime, delta);
- } else {
- idle = *sleeptime;
- }
- } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq));
-
- return ktime_to_us(idle);
-
-}
-
-/**
- * get_cpu_idle_time_us - get the total idle time of a CPU
- * @cpu: CPU number to query
- * @last_update_time: variable to store update time in. Do not update
- * counters if NULL.
- *
- * Return the cumulative idle time (since boot) for a given
- * CPU, in microseconds. Note that this is partially broken due to
- * the counter of iowait tasks that can be remotely updated without
- * any synchronization. Therefore it is possible to observe backward
- * values within two consecutive reads.
- *
- * This time is measured via accounting rather than sampling,
- * and is as accurate as ktime_get() is.
- *
- * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu
- */
-u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
-{
- struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-
- return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime,
- !nr_iowait_cpu(cpu), last_update_time);
-}
-EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
-
-/**
- * get_cpu_iowait_time_us - get the total iowait time of a CPU
- * @cpu: CPU number to query
- * @last_update_time: variable to store update time in. Do not update
- * counters if NULL.
- *
- * Return the cumulative iowait time (since boot) for a given
- * CPU, in microseconds. Note this is partially broken due to
- * the counter of iowait tasks that can be remotely updated without
- * any synchronization. Therefore it is possible to observe backward
- * values within two consecutive reads.
- *
- * This time is measured via accounting rather than sampling,
- * and is as accurate as ktime_get() is.
- *
- * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu
- */
-u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
-{
- struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-
- return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime,
- nr_iowait_cpu(cpu), last_update_time);
-}
-EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
-
/* Simplified variant of hrtimer_forward_now() */
static ktime_t tick_forward_now(ktime_t expires, ktime_t now)
{
@@ -1273,7 +1158,7 @@ void tick_nohz_idle_stop_tick(void)
ts->idle_expires = expires;
if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
- ts->idle_jiffies = ts->last_jiffies;
+ kcpustat_dyntick_start(ts->idle_entrytime);
nohz_balance_enter_idle(cpu);
}
} else {
@@ -1286,6 +1171,20 @@ void tick_nohz_idle_retain_tick(void)
tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
}
+static void tick_nohz_clock_sleep(struct tick_sched *ts)
+{
+ tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE);
+ sched_clock_idle_sleep_event();
+}
+
+static void tick_nohz_clock_wakeup(struct tick_sched *ts)
+{
+ if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)) {
+ tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE);
+ sched_clock_idle_wakeup_event();
+ }
+}
+
/**
* tick_nohz_idle_enter - prepare for entering idle on the current CPU
*
@@ -1300,11 +1199,10 @@ void tick_nohz_idle_enter(void)
local_irq_disable();
ts = this_cpu_ptr(&tick_cpu_sched);
-
WARN_ON_ONCE(ts->timer_expires_base);
-
tick_sched_flag_set(ts, TS_FLAG_INIDLE);
- tick_nohz_start_idle(ts);
+ ts->idle_entrytime = ktime_get();
+ tick_nohz_clock_sleep(ts);
local_irq_enable();
}
@@ -1332,10 +1230,14 @@ void tick_nohz_irq_exit(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
- if (tick_sched_flag_test(ts, TS_FLAG_INIDLE))
- tick_nohz_start_idle(ts);
- else
+ if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) {
+ tick_nohz_clock_sleep(ts);
+ ts->idle_entrytime = ktime_get();
+ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
+ kcpustat_irq_exit(ts->idle_entrytime);
+ } else {
tick_nohz_full_update_tick(ts);
+ }
}
/**
@@ -1407,8 +1309,7 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
* If the next highres timer to expire is earlier than 'next_event', the
* idle governor needs to know that.
*/
- next_event = min_t(u64, next_event,
- hrtimer_next_event_without(&ts->sched_timer));
+ next_event = min(next_event, hrtimer_next_event_without(&ts->sched_timer));
return ktime_sub(next_event, now);
}
@@ -1429,36 +1330,20 @@ unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
return ts->idle_calls;
}
-static void tick_nohz_account_idle_time(struct tick_sched *ts,
- ktime_t now)
-{
- unsigned long ticks;
-
- ts->idle_exittime = now;
-
- if (vtime_accounting_enabled_this_cpu())
- return;
- /*
- * We stopped the tick in idle. update_process_times() would miss the
- * time we slept, as it does only a 1 tick accounting.
- * Enforce that this is accounted to idle !
- */
- ticks = jiffies - ts->idle_jiffies;
- /*
- * We might be one off. Do not randomly account a huge number of ticks!
- */
- if (ticks && ticks < LONG_MAX)
- account_idle_ticks(ticks);
-}
-
void tick_nohz_idle_restart_tick(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
- ktime_t now = ktime_get();
- tick_nohz_restart_sched_tick(ts, now);
- tick_nohz_account_idle_time(ts, now);
+ /*
+ * Update entrytime here in case the tick restart is due to temporary
+ * polling on forced broadcast. The tick may be stopped again later within
+ * the same idle trip. The idle_entrytime was updated recently but make sure
+ * no tiny amount of idle time is accounted twice.
+ */
+ ts->idle_entrytime = ktime_get();
+ kcpustat_dyntick_stop(ts->idle_entrytime);
+ tick_nohz_restart_sched_tick(ts, ts->idle_entrytime);
}
}
@@ -1468,8 +1353,6 @@ static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
__tick_nohz_full_update_tick(ts, now);
else
tick_nohz_restart_sched_tick(ts, now);
-
- tick_nohz_account_idle_time(ts, now);
}
/**
@@ -1491,7 +1374,6 @@ static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
void tick_nohz_idle_exit(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
- bool idle_active, tick_stopped;
ktime_t now;
local_irq_disable();
@@ -1500,17 +1382,13 @@ void tick_nohz_idle_exit(void)
WARN_ON_ONCE(ts->timer_expires_base);
tick_sched_flag_clear(ts, TS_FLAG_INIDLE);
- idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE);
- tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
+ tick_nohz_clock_wakeup(ts);
- if (idle_active || tick_stopped)
+ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
now = ktime_get();
-
- if (idle_active)
- tick_nohz_stop_idle(ts, now);
-
- if (tick_stopped)
+ kcpustat_dyntick_stop(now);
tick_nohz_idle_update_tick(ts, now);
+ }
local_irq_enable();
}
@@ -1565,11 +1443,14 @@ static inline void tick_nohz_irq_enter(void)
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
ktime_t now;
- if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED | TS_FLAG_IDLE_ACTIVE))
+ tick_nohz_clock_wakeup(ts);
+
+ if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED))
return;
+
now = ktime_get();
- if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE))
- tick_nohz_stop_idle(ts, now);
+ kcpustat_irq_enter(now);
+
/*
* If all CPUs are idle we may need to update a stale jiffies value.
* Note nohz_full is a special case: a timekeeper is guaranteed to stay
@@ -1577,8 +1458,7 @@ static inline void tick_nohz_irq_enter(void)
* rare case (typically stop machine). So we must make sure we have a
* last resort.
*/
- if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
- tick_nohz_update_jiffies(now);
+ tick_nohz_update_jiffies(now);
}
#else
@@ -1648,20 +1528,15 @@ void tick_setup_sched_timer(bool hrtimer)
void tick_sched_timer_dying(int cpu)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- ktime_t idle_sleeptime, iowait_sleeptime;
unsigned long idle_calls, idle_sleeps;
/* This must happen before hrtimers are migrated! */
if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES))
hrtimer_cancel(&ts->sched_timer);
- idle_sleeptime = ts->idle_sleeptime;
- iowait_sleeptime = ts->iowait_sleeptime;
idle_calls = ts->idle_calls;
idle_sleeps = ts->idle_sleeps;
memset(ts, 0, sizeof(*ts));
- ts->idle_sleeptime = idle_sleeptime;
- ts->iowait_sleeptime = iowait_sleeptime;
ts->idle_calls = idle_calls;
ts->idle_sleeps = idle_sleeps;
}
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index b4a7822f495d..79b9252047b1 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -44,9 +44,7 @@ struct tick_device {
* to resume the tick timer operation in the timeline
* when the CPU returns from nohz sleep.
* @next_tick: Next tick to be fired when in dynticks mode.
- * @idle_jiffies: jiffies at the entry to idle for idle time accounting
* @idle_waketime: Time when the idle was interrupted
- * @idle_sleeptime_seq: sequence counter for data consistency
* @idle_entrytime: Time when the idle call was entered
* @last_jiffies: Base jiffies snapshot when next event was last computed
* @timer_expires_base: Base time clock monotonic for @timer_expires
@@ -55,9 +53,6 @@ struct tick_device {
* @idle_expires: Next tick in idle, for debugging purpose only
* @idle_calls: Total number of idle calls
* @idle_sleeps: Number of idle calls, where the sched tick was stopped
- * @idle_exittime: Time when the idle state was left
- * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
- * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
* @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick
* @check_clocks: Notification mechanism about clocksource changes
*/
@@ -73,12 +68,10 @@ struct tick_sched {
struct hrtimer sched_timer;
ktime_t last_tick;
ktime_t next_tick;
- unsigned long idle_jiffies;
ktime_t idle_waketime;
unsigned int got_idle_tick;
/* Idle entry */
- seqcount_t idle_sleeptime_seq;
ktime_t idle_entrytime;
/* Tick stop */
@@ -90,11 +83,6 @@ struct tick_sched {
unsigned long idle_calls;
unsigned long idle_sleeps;
- /* Idle exit */
- ktime_t idle_exittime;
- ktime_t idle_sleeptime;
- ktime_t iowait_sleeptime;
-
/* Full dynticks handling */
atomic_t tick_dep_mask;
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 0d832317d576..771cef87ad3b 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -207,7 +207,7 @@ SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv,
get_user(new_ts.tv_nsec, &tv->tv_usec))
return -EFAULT;
- if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0)
+ if (new_ts.tv_nsec >= USEC_PER_SEC || new_ts.tv_nsec < 0)
return -EINVAL;
new_ts.tv_nsec *= NSEC_PER_USEC;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c493a4010305..0d5b67f609bb 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -67,6 +67,7 @@ static inline bool tk_is_aux(const struct timekeeper *tk)
{
return tk->id >= TIMEKEEPER_AUX_FIRST && tk->id <= TIMEKEEPER_AUX_LAST;
}
+static inline struct tk_data *aux_get_tk_data(clockid_t id);
#else
static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts)
{
@@ -77,6 +78,10 @@ static inline bool tk_is_aux(const struct timekeeper *tk)
{
return false;
}
+static inline struct tk_data *aux_get_tk_data(clockid_t id)
+{
+ return NULL;
+}
#endif
static inline void tk_update_aux_offs(struct timekeeper *tk, ktime_t offs)
@@ -315,6 +320,7 @@ static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr)
return clock->read(clock);
}
+
static inline void clocksource_disable_inline_read(void) { }
static inline void clocksource_enable_inline_read(void) { }
#endif
@@ -1182,44 +1188,107 @@ noinstr time64_t __ktime_get_real_seconds(void)
return tk->xtime_sec;
}
-/**
- * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter
- * @systime_snapshot: pointer to struct receiving the system time snapshot
- */
-void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
+static inline u64 tk_clock_read_snapshot(const struct tk_read_base *tkr,
+ struct clocksource_hw_snapshot *chs)
{
- struct timekeeper *tk = &tk_core.timekeeper;
+ struct clocksource *clock = READ_ONCE(tkr->clock);
+
+ if (unlikely(clock->read_snapshot))
+ return clock->read_snapshot(clock, chs);
+
+ return clock->read(clock);
+}
+
+
+/**
+ * ktime_get_snapshot_id - Simultaneously snapshot a given clock ID with
+ * CLOCK_MONOTONIC_RAW and the underlying
+ * clocksource counter value.
+ * @clock_id: The clock ID to snapshot
+ * @systime_snapshot: Pointer to struct receiving the system time snapshot
+ */
+void ktime_get_snapshot_id(clockid_t clock_id, struct system_time_snapshot *systime_snapshot)
+{
+ ktime_t base_raw, base_sys, offs_sys, *offs, offs_zero = 0;
+ u64 nsec_raw, nsec_sys, now;
+ struct timekeeper *tk;
+ struct tk_data *tkd;
unsigned int seq;
- ktime_t base_raw;
- ktime_t base_real;
- ktime_t base_boot;
- u64 nsec_raw;
- u64 nsec_real;
- u64 now;
- WARN_ON_ONCE(timekeeping_suspended);
+ /* Invalidate the snapshot for all failure cases */
+ systime_snapshot->valid = false;
+
+ if (WARN_ON_ONCE(timekeeping_suspended))
+ return;
+
+ switch (clock_id) {
+ case CLOCK_REALTIME:
+ tkd = &tk_core;
+ offs = &tk_core.timekeeper.offs_real;
+ break;
+ /* Map RAW to MONOTONIC so the loop below is trivial */
+ case CLOCK_MONOTONIC_RAW:
+ case CLOCK_MONOTONIC:
+ tkd = &tk_core;
+ offs = &offs_zero;
+ break;
+ case CLOCK_BOOTTIME:
+ tkd = &tk_core;
+ offs = &tk_core.timekeeper.offs_boot;
+ break;
+ case CLOCK_AUX ... CLOCK_AUX_LAST:
+ tkd = aux_get_tk_data(clock_id);
+ if (!tkd)
+ return;
+ offs = &tkd->timekeeper.offs_aux;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ tk = &tkd->timekeeper;
do {
- seq = read_seqcount_begin(&tk_core.seq);
- now = tk_clock_read(&tk->tkr_mono);
+ struct clocksource_hw_snapshot chs = { };
+
+ seq = read_seqcount_begin(&tkd->seq);
+
+ /* Aux clocks can be invalid */
+ if (!tk->clock_valid)
+ return;
+
+ now = tk_clock_read_snapshot(&tk->tkr_mono, &chs);
systime_snapshot->cs_id = tk->tkr_mono.clock->id;
+
+ systime_snapshot->hw_cycles = chs.hw_cycles;
+ systime_snapshot->hw_csid = chs.hw_csid;
+
systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
- base_real = ktime_add(tk->tkr_mono.base,
- tk_core.timekeeper.offs_real);
- base_boot = ktime_add(tk->tkr_mono.base,
- tk_core.timekeeper.offs_boot);
+
+ base_sys = tk->tkr_mono.base;
+ offs_sys = *offs;
base_raw = tk->tkr_raw.base;
- nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now);
- nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now);
- } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ nsec_sys = timekeeping_cycles_to_ns(&tk->tkr_mono, now);
+ nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now);
+ } while (read_seqcount_retry(&tkd->seq, seq));
systime_snapshot->cycles = now;
- systime_snapshot->real = ktime_add_ns(base_real, nsec_real);
- systime_snapshot->boot = ktime_add_ns(base_boot, nsec_real);
- systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw);
+ systime_snapshot->systime = ktime_add_ns(base_sys, offs_sys + nsec_sys);
+ systime_snapshot->monoraw = ktime_add_ns(base_raw, nsec_raw);
+
+ /*
+ * Special case for PTP. Just transfer the raw time into sys,
+ * so the call sites can consistently use snap::systime.
+ */
+ if (clock_id == CLOCK_MONOTONIC_RAW)
+ systime_snapshot->systime = systime_snapshot->monoraw;
+ /* Tell the consumer that this snapshot is valid */
+ systime_snapshot->valid = true;
}
-EXPORT_SYMBOL_GPL(ktime_get_snapshot);
+EXPORT_SYMBOL_GPL(ktime_get_snapshot_id);
/* Scale base by mult/div checking for overflow */
static int scale64_check_overflow(u64 mult, u64 div, u64 *base)
@@ -1262,7 +1331,7 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history,
struct system_device_crosststamp *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
- u64 corr_raw, corr_real;
+ u64 corr_raw, corr_sys;
bool interp_forward;
int ret;
@@ -1279,8 +1348,7 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history,
* Scale the monotonic raw time delta by:
* partial_history_cycles / total_history_cycles
*/
- corr_raw = (u64)ktime_to_ns(
- ktime_sub(ts->sys_monoraw, history->raw));
+ corr_raw = (u64)ktime_to_ns(ktime_sub(ts->sys_monoraw, history->monoraw));
ret = scale64_check_overflow(partial_history_cycles,
total_history_cycles, &corr_raw);
if (ret)
@@ -1288,30 +1356,29 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history,
/*
* If there is a discontinuity in the history, scale monotonic raw
- * correction by:
- * mult(real)/mult(raw) yielding the realtime correction
- * Otherwise, calculate the realtime correction similar to monotonic
- * raw calculation
+ * correction by:
+ * mult(sys)/mult(raw) yielding the system time correction
+ *
+ * Otherwise, calculate the system time correction similar to monotonic
+ * raw calculation
*/
if (discontinuity) {
- corr_real = mul_u64_u32_div
- (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult);
+ corr_sys = mul_u64_u32_div(corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult);
} else {
- corr_real = (u64)ktime_to_ns(
- ktime_sub(ts->sys_realtime, history->real));
- ret = scale64_check_overflow(partial_history_cycles,
- total_history_cycles, &corr_real);
+ corr_sys = (u64)ktime_to_ns(ktime_sub(ts->sys_systime, history->systime));
+ ret = scale64_check_overflow(partial_history_cycles, total_history_cycles,
+ &corr_sys);
if (ret)
return ret;
}
- /* Fixup monotonic raw and real time time values */
+ /* Fixup monotonic raw and system time values */
if (interp_forward) {
- ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw);
- ts->sys_realtime = ktime_add_ns(history->real, corr_real);
+ ts->sys_monoraw = ktime_add_ns(history->monoraw, corr_raw);
+ ts->sys_systime = ktime_add_ns(history->systime, corr_sys);
} else {
ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw);
- ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real);
+ ts->sys_systime = ktime_sub_ns(ts->sys_systime, corr_sys);
}
return 0;
@@ -1368,6 +1435,8 @@ static bool convert_base_to_cs(struct system_counterval_t *scv)
return false;
scv->cycles += base->offset;
+ /* Set the clocksource ID as scv::cycles is now clocksource based */
+ scv->cs_id = cs->id;
return true;
}
@@ -1435,11 +1504,11 @@ EXPORT_SYMBOL_GPL(ktime_real_to_base_clock);
/**
* get_device_system_crosststamp - Synchronously capture system/device timestamp
- * @get_time_fn: Callback to get simultaneous device time and
- * system counter from the device driver
+ * @get_time_fn: Callback to get simultaneous device time and system counter
+ * from the device driver
* @ctx: Context passed to get_time_fn()
- * @history_begin: Historical reference point used to interpolate system
- * time when counter provided by the driver is before the current interval
+ * @history_begin: Historical reference point used to interpolate system time when
+ * the counter value provided by the driver is before the current interval
* @xtstamp: Receives simultaneously captured system and device time
*
* Reads a timestamp from a device and correlates it to system time
@@ -1452,36 +1521,54 @@ int get_device_system_crosststamp(int (*get_time_fn)
struct system_time_snapshot *history_begin,
struct system_device_crosststamp *xtstamp)
{
- struct system_counterval_t system_counterval = {};
- struct timekeeper *tk = &tk_core.timekeeper;
- u64 cycles, now, interval_start;
- unsigned int clock_was_set_seq = 0;
- ktime_t base_real, base_raw;
- u64 nsec_real, nsec_raw;
+ u64 syscnt_cycles, cycles, now, interval_start;
+ unsigned int seq, clock_was_set_seq = 0;
+ ktime_t base_sys, base_raw, *offs;
+ u64 nsec_sys, nsec_raw;
u8 cs_was_changed_seq;
- unsigned int seq;
bool do_interp;
+ struct timekeeper *tk;
+ struct tk_data *tkd;
int ret;
+ switch (xtstamp->clock_id) {
+ case CLOCK_REALTIME:
+ tkd = &tk_core;
+ offs = &tk_core.timekeeper.offs_real;
+ break;
+ case CLOCK_AUX ... CLOCK_AUX_LAST:
+ tkd = aux_get_tk_data(xtstamp->clock_id);
+ if (!tkd)
+ return -ENODEV;
+ offs = &tkd->timekeeper.offs_aux;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -ENODEV;
+ }
+
+ tk = &tkd->timekeeper;
+
do {
- seq = read_seqcount_begin(&tk_core.seq);
+ seq = read_seqcount_begin(&tkd->seq);
/*
* Try to synchronously capture device time and a system
* counter value calling back into the device driver
*/
- ret = get_time_fn(&xtstamp->device, &system_counterval, ctx);
+ ret = get_time_fn(&xtstamp->device, &xtstamp->sys_counter, ctx);
if (ret)
return ret;
/*
* Verify that the clocksource ID associated with the captured
* system counter value is the same as for the currently
- * installed timekeeper clocksource
+ * installed timekeeper clocksource and convert to it.
*/
- if (system_counterval.cs_id == CSID_GENERIC ||
- !convert_base_to_cs(&system_counterval))
+ if (xtstamp->sys_counter.cs_id == CSID_GENERIC ||
+ !convert_base_to_cs(&xtstamp->sys_counter))
return -ENODEV;
- cycles = system_counterval.cycles;
+
+ cycles = syscnt_cycles = xtstamp->sys_counter.cycles;
/*
* Check whether the system counter value provided by the
@@ -1498,15 +1585,14 @@ int get_device_system_crosststamp(int (*get_time_fn)
do_interp = false;
}
- base_real = ktime_add(tk->tkr_mono.base,
- tk_core.timekeeper.offs_real);
+ base_sys = ktime_add(tk->tkr_mono.base, *offs);
base_raw = tk->tkr_raw.base;
- nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles);
+ nsec_sys = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles);
nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles);
- } while (read_seqcount_retry(&tk_core.seq, seq));
+ } while (read_seqcount_retry(&tkd->seq, seq));
- xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real);
+ xtstamp->sys_systime = ktime_add_ns(base_sys, nsec_sys);
xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw);
/*
@@ -1523,24 +1609,19 @@ int get_device_system_crosststamp(int (*get_time_fn)
* clocksource change
*/
if (!history_begin ||
- !timestamp_in_interval(history_begin->cycles,
- cycles, system_counterval.cycles) ||
+ !timestamp_in_interval(history_begin->cycles, cycles, syscnt_cycles) ||
history_begin->cs_was_changed_seq != cs_was_changed_seq)
return -EINVAL;
- partial_history_cycles = cycles - system_counterval.cycles;
+
+ partial_history_cycles = cycles - syscnt_cycles;
total_history_cycles = cycles - history_begin->cycles;
- discontinuity =
- history_begin->clock_was_set_seq != clock_was_set_seq;
+ discontinuity = history_begin->clock_was_set_seq != clock_was_set_seq;
- ret = adjust_historical_crosststamp(history_begin,
- partial_history_cycles,
- total_history_cycles,
- discontinuity, xtstamp);
- if (ret)
- return ret;
+ ret = adjust_historical_crosststamp(history_begin, partial_history_cycles,
+ total_history_cycles, discontinuity, xtstamp);
}
- return 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(get_device_system_crosststamp);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 04d928c21aba..655a8c6cd84d 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1932,7 +1932,7 @@ static void timer_recalc_next_expiry(struct timer_base *base)
*/
static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
{
- u64 nextevt = hrtimer_get_next_event();
+ u64 nextevt = ktime_to_ns(hrtimer_get_next_event());
/*
* If high resolution timers are enabled
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 427d7ddea3af..514802def1e0 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -152,14 +152,10 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
P_flag(highres, TS_FLAG_HIGHRES);
P_ns(last_tick);
P_flag(tick_stopped, TS_FLAG_STOPPED);
- P(idle_jiffies);
P(idle_calls);
P(idle_sleeps);
P_ns(idle_entrytime);
P_ns(idle_waketime);
- P_ns(idle_exittime);
- P_ns(idle_sleeptime);
- P_ns(iowait_sleeptime);
P(last_jiffies);
P(next_timer);
P_ns(idle_expires);
@@ -256,7 +252,7 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)
static inline void timer_list_header(struct seq_file *m, u64 now)
{
- SEQ_printf(m, "Timer List Version: v0.10\n");
+ SEQ_printf(m, "Timer List Version: v0.11\n");
SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
SEQ_printf(m, "\n");
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 155eeaea4113..806c23cf71fc 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -102,7 +102,7 @@
* active CPU/group information atomic_try_cmpxchg() is used instead and only
* the per CPU tmigr_cpu->lock is held.
*
- * During the setup of groups tmigr_level_list is required. It is protected by
+ * During the setup of groups, hier->level_list is required. It is protected by
* @tmigr_mutex.
*
* When @timer_base->lock as well as tmigr related locks are required, the lock
@@ -416,13 +416,12 @@
*/
static DEFINE_MUTEX(tmigr_mutex);
-static struct list_head *tmigr_level_list __read_mostly;
+
+static LIST_HEAD(tmigr_hierarchy_list);
static unsigned int tmigr_hierarchy_levels __read_mostly;
static unsigned int tmigr_crossnode_level __read_mostly;
-static struct tmigr_group *tmigr_root;
-
static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu);
/*
@@ -978,8 +977,12 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now,
/* Drop the lock to allow the remote CPU to exit idle */
raw_spin_unlock_irq(&tmc->lock);
- if (cpu != smp_processor_id())
- timer_expire_remote(cpu);
+ /*
+ * This can't exclude the local CPU because jiffies might have advanced
+ * after the timer softirq invoked run_timer_base(BASE_GLOBAL) and the
+ * point where the jiffies snapshot @jif was taken in tmigr_handle_remote().
+ */
+ timer_expire_remote(cpu);
/*
* Lock ordering needs to be preserved - timer_base locks before tmigr
@@ -1465,6 +1468,34 @@ static long tmigr_trigger_active(void *unused)
return 0;
}
+static unsigned int tmigr_get_capacity(int cpu)
+{
+ /*
+ * nohz_full CPUs need to make sure there is always an available (online)
+ * and never idle migrator to handle all their global timers. That duty
+ * is served by the timekeeper which then never stops its tick. But the
+ * timekeeper must then belong to the same hierarchy as all the nohz_full
+ * CPUs. Simply turn off capacity awareness when nohz_full is running.
+ *
+ * Capacity awareness is also off unless CONFIG_BROKEN is enabled. In
+ * both cases every CPU reports SCHED_CAPACITY_SCALE, so all CPUs resolve
+ * to the same capacity value. Otherwise the value returned by
+ * arch_scale_cpu_capacity() for @cpu is used.
+ *
+ * NOTE(review): the reason for the CONFIG_BROKEN gating is not stated
+ * here -- confirm it is intentional.
+ */
+ if (tick_nohz_full_enabled() || !IS_ENABLED(CONFIG_BROKEN))
+ return SCHED_CAPACITY_SCALE;
+ else
+ return arch_scale_cpu_capacity(cpu);
+}
+
+static struct tmigr_hierarchy *__tmigr_get_hierarchy(int cpu)
+{
+ unsigned int capacity = tmigr_get_capacity(cpu); /* lookup key for @cpu */
+ struct tmigr_hierarchy *iter;
+
+ list_for_each_entry(iter, &tmigr_hierarchy_list, node) {
+ if (iter->capacity == capacity) /* hierarchies are matched by capacity value */
+ return iter;
+ }
+
+ return NULL; /* no hierarchy on tmigr_hierarchy_list has this capacity */
+}
+
static int tmigr_clear_cpu_available(unsigned int cpu)
{
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
@@ -1489,8 +1520,21 @@ static int tmigr_clear_cpu_available(unsigned int cpu)
}
if (firstexp != KTIME_MAX) {
- migrator = cpumask_any(tmigr_available_cpumask);
- work_on_cpu(migrator, tmigr_trigger_active, NULL);
+ struct tmigr_hierarchy *hier = __tmigr_get_hierarchy(cpu);
+
+ if (WARN_ON_ONCE(!hier))
+ return -EINVAL;
+
+ migrator = cpumask_any_and(tmigr_available_cpumask, hier->cpumask);
+ if (migrator < nr_cpu_ids) {
+ work_on_cpu(migrator, tmigr_trigger_active, NULL);
+ } else {
+ /*
+ * If deactivation returned an expiration, it belongs to an available
+ * nohz CPU in the hierarchy.
+ */
+ WARN_ONCE(1, "Expected available CPU in the hierarchy\n");
+ }
}
return 0;
@@ -1653,14 +1697,14 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
group->groupevt.ignore = true;
}
-static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
+static struct tmigr_group *tmigr_get_group(struct tmigr_hierarchy *hier, int node, unsigned int lvl)
{
struct tmigr_group *tmp, *group = NULL;
lockdep_assert_held(&tmigr_mutex);
/* Try to attach to an existing group first */
- list_for_each_entry(tmp, &tmigr_level_list[lvl], list) {
+ list_for_each_entry(tmp, &hier->level_list[lvl], list) {
/*
* If @lvl is below the cross NUMA node level, check whether
* this group belongs to the same NUMA node.
@@ -1694,14 +1738,14 @@ static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
tmigr_init_group(group, lvl, node);
/* Setup successful. Add it to the hierarchy */
- list_add(&group->list, &tmigr_level_list[lvl]);
+ list_add(&group->list, &hier->level_list[lvl]);
trace_tmigr_group_set(group);
return group;
}
-static bool tmigr_init_root(struct tmigr_group *group, bool activate)
+static bool tmigr_init_root(struct tmigr_hierarchy *hier, struct tmigr_group *group, bool activate)
{
- if (!group->parent && group != tmigr_root) {
+ if (!group->parent && group != hier->root) {
/*
* This is the new top-level, prepare its groupmask in advance
* to avoid accidents where yet another new top-level is
@@ -1717,11 +1761,10 @@ static bool tmigr_init_root(struct tmigr_group *group, bool activate)
}
-static void tmigr_connect_child_parent(struct tmigr_group *child,
- struct tmigr_group *parent,
- bool activate)
+static void tmigr_connect_child_parent(struct tmigr_hierarchy *hier, struct tmigr_group *child,
+ struct tmigr_group *parent, bool activate)
{
- if (tmigr_init_root(parent, activate)) {
+ if (tmigr_init_root(hier, parent, activate)) {
/*
* The previous top level had prepared its groupmask already,
* simply account it in advance as the first child. If some groups
@@ -1754,13 +1797,13 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
*/
smp_store_release(&child->parent, parent);
- trace_tmigr_connect_child_parent(child);
+ trace_tmigr_connect_child_parent(hier, child);
}
-static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
- struct tmigr_group *start, bool activate)
+static int tmigr_setup_groups(struct tmigr_hierarchy *hier, unsigned int cpu,
+ unsigned int node, struct tmigr_group *start, bool activate)
{
- struct tmigr_group *group, *child, **stack;
+ struct tmigr_group *root = hier->root, *group, *child, **stack;
int i, top = 0, err = 0, start_lvl = 0;
bool root_mismatch = false;
@@ -1773,11 +1816,11 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
start_lvl = start->level + 1;
}
- if (tmigr_root)
- root_mismatch = tmigr_root->numa_node != node;
+ if (root)
+ root_mismatch = root->numa_node != node;
for (i = start_lvl; i < tmigr_hierarchy_levels; i++) {
- group = tmigr_get_group(node, i);
+ group = tmigr_get_group(hier, node, i);
if (IS_ERR(group)) {
err = PTR_ERR(group);
i--;
@@ -1799,7 +1842,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
if (group->parent)
break;
if ((!root_mismatch || i >= tmigr_crossnode_level) &&
- list_is_singular(&tmigr_level_list[i]))
+ list_is_singular(&hier->level_list[i]))
break;
}
@@ -1827,15 +1870,15 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
tmc->tmgroup = group;
tmc->groupmask = BIT(group->num_children++);
- tmigr_init_root(group, activate);
+ tmigr_init_root(hier, group, activate);
- trace_tmigr_connect_cpu_parent(tmc);
+ trace_tmigr_connect_cpu_parent(hier, tmc);
/* There are no children that need to be connected */
continue;
} else {
child = stack[i - 1];
- tmigr_connect_child_parent(child, group, activate);
+ tmigr_connect_child_parent(hier, child, group, activate);
}
}
@@ -1860,31 +1903,54 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
* child to the new parents. So tmigr_active_up() activates the
* new parents while walking up from the old root to the new.
*
- * * It is ensured that @start is active, as this setup path is
- * executed in hotplug prepare callback. This is executed by an
- * already connected and !idle CPU. Even if all other CPUs go idle,
- * the CPU executing the setup will be responsible up to current top
- * level group. And the next time it goes inactive, it will release
- * the new childmask and parent to subsequent walkers through this
- * @child. Therefore propagate active state unconditionally.
+ * * It is ensured that @start is active, (or on the way to be activated
+ * by another CPU that woke up before the current one) as this setup path
+ * is executed in hotplug prepare callback. This is executed by an already
+ * connected and !idle CPU in the hierarchy.
+ *
+ * * The below RmW atomic operation ensures that:
+ *
+ * 1) If the old root has been completely activated, the latest state is
+ * acquired (the below implicit acquire pairs with the implicit release
+ * from cmpxchg() in tmigr_active_up()).
+ *
+ * 2) If the old root is still on the way to be activated, the lagging behind
+ * CPU performing the activation will acquire the links up to the new root.
+ * (The below implicit release pairs with the implicit acquire from cmpxchg()
+ * in tmigr_active_up()).
+ *
+ * 3) Every subsequent CPU below the old root will acquire the new links while
+ * walking through the old root (The below implicit release pairs with the
+ * implicit acquire from cmpxchg() in either tmigr_active_up() or
+ * tmigr_inactive_up()).
*/
- state.state = atomic_read(&start->migr_state);
- WARN_ON_ONCE(!state.active);
+ state.state = atomic_fetch_or(0, &start->migr_state);
WARN_ON_ONCE(!start->parent);
- data.childmask = start->groupmask;
- __walk_groups_from(tmigr_active_up, &data, start, start->parent);
+ /*
+ * If the state of the old root is inactive, another CPU is on its way to activate
+ * it and propagate to the new root.
+ */
+ if (state.active) {
+ data.childmask = start->groupmask;
+ __walk_groups_from(tmigr_active_up, &data, start, start->parent);
+ }
+ } else if (start) {
+ union tmigr_state state;
+
+ /* Remote activation assumes the whole target's hierarchy is inactive */
+ state.state = atomic_read(&start->migr_state);
+ WARN_ON_ONCE(state.active);
}
/* Root update */
- if (list_is_singular(&tmigr_level_list[top])) {
- group = list_first_entry(&tmigr_level_list[top],
- typeof(*group), list);
+ if (list_is_singular(&hier->level_list[top])) {
+ group = list_first_entry(&hier->level_list[top], typeof(*group), list);
WARN_ON_ONCE(group->parent);
- if (tmigr_root) {
+ if (root) {
/* Old root should be the same or below */
- WARN_ON_ONCE(tmigr_root->level > top);
+ WARN_ON_ONCE(root->level > top);
}
- tmigr_root = group;
+ hier->root = group;
}
out:
kfree(stack);
@@ -1892,34 +1958,123 @@ out:
return err;
}
+static struct tmigr_hierarchy *tmigr_get_hierarchy(int cpu)
+{
+ struct tmigr_hierarchy *hier;
+
+ hier = __tmigr_get_hierarchy(cpu);
+
+ if (hier) /* a hierarchy with this CPU's capacity already exists: reuse it */
+ return hier;
+
+ hier = kzalloc_flex(*hier, level_list, tmigr_hierarchy_levels); /* level_list[] gets tmigr_hierarchy_levels entries */
+ if (!hier)
+ return ERR_PTR(-ENOMEM); /* failures are reported as ERR_PTR(), never NULL */
+
+ hier->cpumask = kzalloc(cpumask_size(), GFP_KERNEL);
+ if (!hier->cpumask) {
+ kfree(hier); /* undo the first allocation before bailing out */
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (int i = 0; i < tmigr_hierarchy_levels; i++)
+ INIT_LIST_HEAD(&hier->level_list[i]);
+
+ hier->capacity = tmigr_get_capacity(cpu); /* key later compared by __tmigr_get_hierarchy() */
+ list_add_tail(&hier->node, &tmigr_hierarchy_list);
+
+ return hier;
+}
+
+static int tmigr_connect_old_root(struct tmigr_hierarchy *hier, int cpu,
+ struct tmigr_group *old_root, bool activate)
+{
+ /*
+ * The target CPU must never do the prepare work, except
+ * on early boot when the boot CPU is the target. Otherwise
+ * it may spuriously activate the old top level group inside
+ * the new one (regardless of whether the old top level group
+ * is active or not) and/or release an uninitialized childmask.
+ *
+ * @cpu is only consulted by this sanity check; the group setup
+ * below is invoked with -1 as its CPU argument.
+ */
+ WARN_ON_ONCE(cpu == smp_processor_id());
+ if (activate) {
+ /*
+ * The current CPU is expected to be online in the hierarchy,
+ * otherwise the old root may not be active as expected.
+ */
+ WARN_ON_ONCE(!__this_cpu_read(tmigr_cpu.available));
+ }
+
+ return tmigr_setup_groups(hier, -1, old_root->numa_node, old_root, activate);
+}
+
+/*
+ * work_on_cpu() callback: connect the old root passed in @arg to the new
+ * root of the hierarchy the executing CPU belongs to, and propagate the
+ * old root's active state.
+ *
+ * Returns the result of tmigr_connect_old_root(), or -EINVAL if no
+ * hierarchy matches the capacity of the CPU running the work.
+ */
+static long connect_old_root_work(void *arg)
+{
+ struct tmigr_group *old_root = arg;
+ struct tmigr_hierarchy *hier;
+ int cpu = smp_processor_id();
+
+ hier = __tmigr_get_hierarchy(cpu);
+ if (WARN_ON_ONCE(!hier))
+ return -EINVAL;
+
+ /*
+ * Do not hand the local CPU to tmigr_connect_old_root(): it warns when
+ * its @cpu argument equals smp_processor_id(), which would be true on
+ * every invocation from here. The hotplug target is not known in this
+ * context and is not expected to be the CPU running this work, so pass
+ * -1 (no CPU), as tmigr_setup_groups() is also called with.
+ */
+ return tmigr_connect_old_root(hier, -1, old_root, true);
+}
+
static int tmigr_add_cpu(unsigned int cpu)
{
- struct tmigr_group *old_root = tmigr_root;
+ struct tmigr_hierarchy *hier;
+ struct tmigr_group *old_root;
int node = cpu_to_node(cpu);
int ret;
guard(mutex)(&tmigr_mutex);
- ret = tmigr_setup_groups(cpu, node, NULL, false);
+ hier = tmigr_get_hierarchy(cpu);
+ if (IS_ERR(hier))
+ return PTR_ERR(hier);
+
+ old_root = hier->root;
+
+ ret = tmigr_setup_groups(hier, cpu, node, NULL, false);
+
+ if (ret < 0)
+ return ret;
/* Root has changed? Connect the old one to the new */
- if (ret >= 0 && old_root && old_root != tmigr_root) {
- /*
- * The target CPU must never do the prepare work, except
- * on early boot when the boot CPU is the target. Otherwise
- * it may spuriously activate the old top level group inside
- * the new one (nevertheless whether old top level group is
- * active or not) and/or release an uninitialized childmask.
- */
- WARN_ON_ONCE(cpu == raw_smp_processor_id());
- /*
- * The (likely) current CPU is expected to be online in the hierarchy,
- * otherwise the old root may not be active as expected.
- */
- WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available);
- ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true);
+ if (old_root && old_root != hier->root) {
+ guard(migrate)();
+
+ if (cpumask_test_cpu(smp_processor_id(), hier->cpumask)) {
+ /*
+ * If the target belongs to the same hierarchy, the old root is expected
+ * to be active. Link and propagate to the new root.
+ */
+ ret = tmigr_connect_old_root(hier, cpu, old_root, true);
+ } else {
+ int target = cpumask_first_and(hier->cpumask, tmigr_available_cpumask);
+
+ if (target < nr_cpu_ids) {
+ /*
+ * If the target doesn't belong to the same hierarchy as the current
+ * CPU, activate from a relevant one to make sure the old root is
+ * active.
+ */
+ ret = work_on_cpu(target, connect_old_root_work, old_root);
+ } else {
+ /*
+ * No other available CPUs in the remote hierarchy. Link the
+ * old root remotely but don't propagate activation since the
+ * old root is not expected to be active.
+ */
+ ret = tmigr_connect_old_root(hier, cpu, old_root, false);
+ }
+ }
}
+ if (ret >= 0)
+ cpumask_set_cpu(cpu, hier->cpumask);
+
return ret;
}
@@ -1952,7 +2107,7 @@ static int tmigr_cpu_prepare(unsigned int cpu)
static int __init tmigr_init(void)
{
- unsigned int cpulvl, nodelvl, cpus_per_node, i;
+ unsigned int cpulvl, nodelvl, cpus_per_node;
unsigned int nnodes = num_possible_nodes();
unsigned int ncpus = num_possible_cpus();
int ret = -ENOMEM;
@@ -1999,14 +2154,6 @@ static int __init tmigr_init(void)
*/
tmigr_crossnode_level = cpulvl;
- tmigr_level_list = kzalloc_objs(struct list_head,
- tmigr_hierarchy_levels);
- if (!tmigr_level_list)
- goto err;
-
- for (i = 0; i < tmigr_hierarchy_levels; i++)
- INIT_LIST_HEAD(&tmigr_level_list[i]);
-
pr_info("Timer migration: %d hierarchy levels; %d children per group;"
" %d crossnode level\n",
tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP,
diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h
index 70879cde6fdd..31735dd52327 100644
--- a/kernel/time/timer_migration.h
+++ b/kernel/time/timer_migration.h
@@ -6,6 +6,24 @@
#define TMIGR_CHILDREN_PER_GROUP 8
/**
+ * struct tmigr_hierarchy - a hierarchy associated to a given CPU capacity.
+ * Homogeneous systems have only one hierarchy.
+ * Heterogeneous systems have one hierarchy per CPU capacity.
+ * @cpumask: CPUs belonging to this hierarchy
+ * @root: The current root of the hierarchy
+ * @capacity: CPU capacity associated to this hierarchy
+ * @node: Node in the global hierarchy list
+ * @level_list: Per level lists of tmigr groups (flexible array member)
+ */
+struct tmigr_hierarchy {
+ struct cpumask *cpumask;
+ struct tmigr_group *root;
+ unsigned long capacity;
+ struct list_head node;
+ struct list_head level_list[];
+};
+
+/**
* struct tmigr_event - a timer event associated to a CPU
* @nextevt: The node to enqueue an event in the parent group queue
* @cpu: The CPU to which this event belongs
@@ -75,15 +93,17 @@ struct tmigr_group {
/**
* struct tmigr_cpu - timer migration per CPU group
* @lock: Lock protecting the tmigr_cpu group information
- * @online: Indicates whether the CPU is online; In deactivate path
- * it is required to know whether the migrator in the top
- * level group is to be set offline, while a timer is
- * pending. Then another online CPU needs to be notified to
- * take over the migrator role. Furthermore the information
- * is required in CPU hotplug path as the CPU is able to go
- * idle before the timer migration hierarchy hotplug AP is
- * reached. During this phase, the CPU has to handle the
+ * @available: Indicates whether the CPU is available for handling
+ * global timers. In the deactivate path it is required to
+ * know whether the migrator in the top level group is to
+ * be set offline, while a timer is pending. Then another
+ * available CPU needs to be notified to take over the
+ * migrator role. Furthermore the information is required
+ * in the CPU hotplug path as the CPU is able to go idle
+ * before the timer migration hierarchy hotplug callback is
+ * reached. During this phase, the CPU has to handle the
* global timers on its own and must not act as a migrator.
+ *
* @idle: Indicates whether the CPU is idle in the timer migration
* hierarchy
* @remote: Is set when timers of the CPU are expired remotely
diff --git a/kernel/torture.c b/kernel/torture.c
index 62c1ac777694..77cb3589b19f 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -972,3 +972,19 @@ void _torture_stop_kthread(char *m, struct task_struct **tp)
*tp = NULL;
}
EXPORT_SYMBOL_GPL(_torture_stop_kthread);
+
+/*
+ * Set the specified task's niceness value, saturating at limits.
+ * Saturating noisily, but saturating.
+ *
+ * @t: task passed on to sched_set_normal()
+ * @nice: requested nice value; a value above MAX_NICE or below MIN_NICE
+ * triggers WARN_ON_ONCE() and is replaced by the limit it exceeded.
+ */
+void torture_sched_set_normal(struct task_struct *t, int nice)
+{
+ int realnice = nice;
+
+ if (WARN_ON_ONCE(realnice > MAX_NICE))
+ realnice = MAX_NICE;
+ if (WARN_ON_ONCE(realnice < MIN_NICE))
+ realnice = MIN_NICE;
+ sched_set_normal(t, realnice);
+}
+EXPORT_SYMBOL_GPL(torture_sched_set_normal);
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 1decdce8cbef..8d3d96e847d8 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -143,8 +143,8 @@ obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o
targets += undefsyms_base.o
KASAN_SANITIZE_undefsyms_base.o := y
-UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __x86_indirect_thunk \
- __msan simple_ring_buffer \
+UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __msan \
+ __aeabi_unwind_cpp __s390_indirect_jump __x86_indirect_thunk simple_ring_buffer \
$(shell $(NM) -u $(obj)/undefsyms_base.o 2>/dev/null | awk '{print $$2}')
quiet_cmd_check_undefined = NM $<
@@ -154,7 +154,8 @@ quiet_cmd_check_undefined = NM $<
echo "Unexpected symbols in $<:" >&2; \
echo "$$undefsyms" >&2; \
false; \
- fi
+ fi; \
+ touch $@
$(obj)/%.o.checked: $(obj)/%.o $(obj)/undefsyms_base.o FORCE
$(call if_changed,check_undefined)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index af7079aa0f36..82f8feea6931 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -23,6 +23,7 @@
#include <linux/sort.h>
#include <linux/key.h>
#include <linux/namei.h>
+#include <linux/file.h>
#include <net/bpf_sk_storage.h>
@@ -42,6 +43,7 @@
#define MAX_UPROBE_MULTI_CNT (1U << 20)
#define MAX_KPROBE_MULTI_CNT (1U << 20)
+#define MAX_TRACING_MULTI_CNT (1U << 20)
#ifdef CONFIG_MODULES
struct bpf_trace_module {
@@ -152,6 +154,34 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
return ret;
}
+/**
+ * trace_call_bpf_faultable - invoke BPF program in faultable context
+ * @call: tracepoint event
+ * @ctx: opaque context pointer
+ *
+ * Variant of trace_call_bpf() for faultable tracepoints (syscall
+ * tracepoints). Supports sleepable BPF programs by using rcu_tasks_trace
+ * for lifetime protection and bpf_prog_run_array_sleepable() for per-program
+ * RCU flavor selection, following the uprobe pattern.
+ *
+ * Per-program recursion protection is provided by
+ * bpf_prog_run_array_sleepable(). Global bpf_prog_active is not
+ * needed because syscall tracepoints cannot self-recurse.
+ *
+ * Must be called from a faultable/preemptible context.
+ *
+ * Return: the value returned by bpf_prog_run_array_sleepable() for
+ * @call->prog_array.
+ */
+unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx)
+{
+ struct bpf_prog_array *prog_array;
+
+ might_fault();
+ guard(rcu_tasks_trace)();
+
+ prog_array = rcu_dereference_check(call->prog_array,
+ rcu_read_lock_trace_held());
+ return bpf_prog_run_array_sleepable(prog_array, ctx, bpf_prog_run);
+}
+
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
{
@@ -1305,7 +1335,8 @@ static inline bool is_uprobe_session(const struct bpf_prog *prog)
static inline bool is_trace_fsession(const struct bpf_prog *prog)
{
return prog->type == BPF_PROG_TYPE_TRACING &&
- prog->expected_attach_type == BPF_TRACE_FSESSION;
+ (prog->expected_attach_type == BPF_TRACE_FSESSION ||
+ prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI);
}
static const struct bpf_func_proto *
@@ -2072,11 +2103,19 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
static __always_inline
void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
{
+ struct srcu_ctr __percpu *scp = NULL;
struct bpf_prog *prog = link->link.prog;
+ bool sleepable = prog->sleepable;
struct bpf_run_ctx *old_run_ctx;
struct bpf_trace_run_ctx run_ctx;
- rcu_read_lock_dont_migrate();
+ if (sleepable) {
+ scp = rcu_read_lock_tasks_trace();
+ migrate_disable();
+ } else {
+ rcu_read_lock_dont_migrate();
+ }
+
if (unlikely(!bpf_prog_get_recursion_context(prog))) {
bpf_prog_inc_misses_counter(prog);
goto out;
@@ -2085,12 +2124,18 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
run_ctx.bpf_cookie = link->cookie;
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
- (void) bpf_prog_run(prog, args);
+ (void)bpf_prog_run(prog, args);
bpf_reset_run_ctx(old_run_ctx);
out:
bpf_prog_put_recursion_context(prog);
- rcu_read_unlock_migrate();
+
+ if (sleepable) {
+ migrate_enable();
+ rcu_read_unlock_tasks_trace(scp);
+ } else {
+ rcu_read_unlock_migrate();
+ }
}
#define UNPACK(...) __VA_ARGS__
@@ -2384,7 +2429,8 @@ static void bpf_kprobe_multi_link_release(struct bpf_link *link)
struct bpf_kprobe_multi_link *kmulti_link;
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
- unregister_fprobe(&kmulti_link->fp);
+ /* Don't wait for RCU GP here. */
+ unregister_fprobe_async(&kmulti_link->fp);
kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt);
}
@@ -3169,6 +3215,38 @@ static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx)
return run_ctx->uprobe->cookie;
}
+static int bpf_uprobe_multi_get_path(const union bpf_attr *attr, struct path *path)
+{
+ void __user *upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path);
+ u32 path_fd = attr->link_create.uprobe_multi.path_fd;
+ u32 flags = attr->link_create.uprobe_multi.flags;
+
+ if (flags & BPF_F_UPROBE_MULTI_PATH_FD) {
+ /*
+ * When BPF_F_UPROBE_MULTI_PATH_FD is set, the executable is
+ * identified by path_fd, upath must be NULL.
+ * Supplying both yields -EINVAL; an empty fd lookup yields
+ * -EBADF.
+ */
+ if (upath)
+ return -EINVAL;
+
+ CLASS(fd, f)(path_fd);
+ if (fd_empty(f))
+ return -EBADF;
+ *path = fd_file(f)->f_path;
+ path_get(path); /* reference on the path copy handed back to the caller */
+ return 0;
+ }
+
+ /*
+ * When BPF_F_UPROBE_MULTI_PATH_FD is not set, the path is resolved
+ * relative to the cwd (AT_FDCWD) or absolute using the upath string.
+ * In this mode upath is mandatory and path_fd must be zero, otherwise
+ * -EINVAL is returned.
+ */
+ if (!upath || path_fd)
+ return -EINVAL;
+
+ return user_path_at(AT_FDCWD, upath, LOOKUP_FOLLOW, path);
+}
+
int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
struct bpf_uprobe_multi_link *link = NULL;
@@ -3178,10 +3256,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
struct task_struct *task = NULL;
unsigned long __user *uoffsets;
u64 __user *ucookies;
- void __user *upath;
+ unsigned long size;
u32 flags, cnt, i;
struct path path;
- char *name;
pid_t pid;
int err;
@@ -3196,19 +3273,18 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
return -EINVAL;
flags = attr->link_create.uprobe_multi.flags;
- if (flags & ~BPF_F_UPROBE_MULTI_RETURN)
+ if (flags & ~(BPF_F_UPROBE_MULTI_RETURN | BPF_F_UPROBE_MULTI_PATH_FD))
return -EINVAL;
/*
- * path, offsets and cnt are mandatory,
+ * offsets and cnt are mandatory,
* ref_ctr_offsets and cookies are optional
*/
- upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path);
uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets);
cnt = attr->link_create.uprobe_multi.cnt;
pid = attr->link_create.uprobe_multi.pid;
- if (!upath || !uoffsets || !cnt || pid < 0)
+ if (!uoffsets || !cnt || pid < 0)
return -EINVAL;
if (cnt > MAX_UPROBE_MULTI_CNT)
return -E2BIG;
@@ -3216,14 +3292,17 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets);
ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies);
- name = strndup_user(upath, PATH_MAX);
- if (IS_ERR(name)) {
- err = PTR_ERR(name);
- return err;
- }
+ /*
+ * All uoffsets/uref_ctr_offsets/ucookies arrays have the same value
+ * size, we need to check their address range is safe for __get_user
+ * calls.
+ */
+ size = sizeof(*uoffsets) * cnt;
+ if (!access_ok(uoffsets, size) || !access_ok(uref_ctr_offsets, size) ||
+ !access_ok(ucookies, size))
+ return -EFAULT;
- err = kern_path(name, LOOKUP_FOLLOW, &path);
- kfree(name);
+ err = bpf_uprobe_multi_get_path(attr, &path);
if (err)
return err;
@@ -3397,12 +3476,12 @@ typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struc
* direct calls into all the specific callback implementations
* (copy_user_data_sleepable, copy_user_data_nofault, and so on)
*/
-static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u64 doff, u64 size,
+static __always_inline int __bpf_dynptr_copy_str(const struct bpf_dynptr *dptr, u64 doff, u64 size,
const void *unsafe_src,
copy_fn_t str_copy_fn,
struct task_struct *tsk)
{
- struct bpf_dynptr_kern *dst;
+ const struct bpf_dynptr_kern *dst;
u64 chunk_sz, off;
void *dst_slice;
int cnt, err;
@@ -3438,7 +3517,7 @@ static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u64
u64 size, const void *unsafe_src,
copy_fn_t copy_fn, struct task_struct *tsk)
{
- struct bpf_dynptr_kern *dst;
+ const struct bpf_dynptr_kern *dst;
void *dst_slice;
char buf[256];
u64 off, chunk_sz;
@@ -3539,49 +3618,49 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid
return bpf_send_signal_common(sig, type, task, value);
}
-__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_probe_read_user_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_data_nofault, NULL);
}
-__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_probe_read_kernel_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void *unsafe_ptr__ign)
{
return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign,
copy_kernel_data_nofault, NULL);
}
-__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_probe_read_user_str_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_str_nofault, NULL);
}
-__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void *unsafe_ptr__ign)
{
return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign,
copy_kernel_str_nofault, NULL);
}
-__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_copy_from_user_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_data_sleepable, NULL);
}
-__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_copy_from_user_str_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_str_sleepable, NULL);
}
-__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_copy_from_user_task_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign,
struct task_struct *tsk)
{
@@ -3589,7 +3668,7 @@ __bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off,
copy_user_data_sleepable, tsk);
}
-__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(const struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign,
struct task_struct *tsk)
{
@@ -3598,3 +3677,203 @@ __bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64
}
__bpf_kfunc_end_defs();
+
+#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) && \
+ defined(CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS)
+
+/*
+ * Link ->release() callback for tracing multi links.
+ *
+ * Hands the link's program and the enclosing bpf_tracing_multi_link to
+ * bpf_trampoline_multi_detach(). The callback returns void, so a non-zero
+ * result cannot be propagated; it only triggers a one-time warning.
+ */
+static void bpf_tracing_multi_link_release(struct bpf_link *link)
+{
+	struct bpf_tracing_multi_link *multi;
+
+	multi = container_of(link, struct bpf_tracing_multi_link, link);
+
+	WARN_ON_ONCE(bpf_trampoline_multi_detach(link->prog, multi));
+}
+
+/*
+ * Link ->dealloc_deferred() callback for tracing multi links.
+ *
+ * Frees the optional cookies and fexits arrays and finally the link object
+ * itself. Both arrays may be NULL; kvfree() accepts that.
+ */
+static void bpf_tracing_multi_link_dealloc(struct bpf_link *link)
+{
+	struct bpf_tracing_multi_link *multi;
+
+	multi = container_of(link, struct bpf_tracing_multi_link, link);
+
+	kvfree(multi->cookies);
+	kvfree(multi->fexits);
+	kvfree(multi);
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Link ->show_fdinfo() callback for tracing multi links.
+ *
+ * Prints the attach type and the node count, followed by a header row and
+ * one row per node holding the obj-id and btf-id unpacked from the node's
+ * trampoline key, the node's cookie (0 when the link has no cookies array)
+ * and the trampoline ip formatted with %pS.
+ */
+static void bpf_tracing_multi_show_fdinfo(const struct bpf_link *link,
+					  struct seq_file *seq)
+{
+	struct bpf_tracing_multi_link *multi;
+	int idx;
+
+	multi = container_of(link, struct bpf_tracing_multi_link, link);
+
+	seq_printf(seq, "attach_type:\t%u\n", multi->link.attach_type);
+	seq_printf(seq, "cnt:\t%u\n", multi->nodes_cnt);
+
+	seq_printf(seq, "%s\t %s\t %s\t %s\n", "obj-id", "btf-id", "cookie", "func");
+	for (idx = 0; idx < multi->nodes_cnt; idx++) {
+		struct bpf_tracing_multi_node *mnode = &multi->nodes[idx];
+		u64 cookie = multi->cookies ? multi->cookies[idx] : 0;
+		u32 btf_id, obj_id;
+
+		bpf_trampoline_unpack_key(mnode->trampoline->key, &obj_id, &btf_id);
+		seq_printf(seq, "%u\t %u\t %llu\t %pS\n",
+			   obj_id, btf_id, cookie,
+			   (void *) mnode->trampoline->ip);
+
+		/* The node list can be long; give the scheduler a chance. */
+		cond_resched();
+	}
+}
+#endif
+
+/*
+ * Link operations installed by bpf_tracing_multi_attach() for
+ * BPF_LINK_TYPE_TRACING_MULTI links.
+ */
+static const struct bpf_link_ops bpf_tracing_multi_link_lops = {
+ .release = bpf_tracing_multi_link_release,
+ .dealloc_deferred = bpf_tracing_multi_link_dealloc,
+#ifdef CONFIG_PROC_FS
+ /* The fdinfo callback is only compiled when procfs is enabled. */
+ .show_fdinfo = bpf_tracing_multi_show_fdinfo,
+#endif
+};
+
+/*
+ * Comparison callback for sorting an array of u32 ids.
+ *
+ * @pa, @pb: pointers to the two u32 elements being compared.
+ * @priv: unused.
+ *
+ * Returns -1, 0 or 1 when *pa is smaller than, equal to or greater than *pb.
+ * The result is built from two comparisons rather than a subtraction so
+ * that it cannot wrap for unsigned operands. The elements are only read,
+ * so they are accessed through const-qualified pointers.
+ */
+static int ids_cmp_r(const void *pa, const void *pb, const void *priv __maybe_unused)
+{
+	const u32 *a = pa;
+	const u32 *b = pb;
+
+	return (*a > *b) - (*a < *b);
+}
+
+/*
+ * Swap callback used together with ids_cmp_r() by check_dup_ids().
+ *
+ * @a, @b: pointers to the two u32 ids to exchange.
+ * @size: unused.
+ * @priv: two-element pointer array; [0] is the base of the ids array being
+ * sorted and [1] is the parallel cookies array, or NULL when there
+ * are no cookies. (It carries __maybe_unused but is in fact read.)
+ *
+ * When a cookies array is present the cookies at the same indexes are
+ * swapped too, so every cookie stays paired with its id.
+ */
+static void ids_swap_r(void *a, void *b, int size __maybe_unused,
+ const void *priv __maybe_unused)
+{
+ u64 *cookie_a, *cookie_b, *cookies;
+ u32 *id_a = a, *id_b = b, *ids;
+ void **data = (void **) priv;
+
+ ids = data[0];
+ cookies = data[1];
+
+ if (cookies) {
+ /* An id's index is its distance from the base of the ids array. */
+ cookie_a = cookies + (id_a - ids);
+ cookie_b = cookies + (id_b - ids);
+ swap(*cookie_a, *cookie_b);
+ }
+ swap(*id_a, *id_b);
+}
+
+/*
+ * Sort @ids and reject the array if it contains the same id twice.
+ *
+ * @ids: array of @cnt ids; sorted in place.
+ * @cookies: optional array of @cnt cookies (may be NULL); permuted together
+ * with @ids so that each cookie keeps its id.
+ * @cnt: number of elements in @ids (and in @cookies, when present).
+ *
+ * Both arrays are left sorted, also when a duplicate is found.
+ *
+ * Returns 0 when all ids are unique, -EINVAL otherwise.
+ */
+static int check_dup_ids(u32 *ids, u64 *cookies, u32 cnt)
+{
+	void *data[2] = { ids, cookies };
+	u32 i;
+
+	sort_r_nonatomic(ids, cnt, sizeof(ids[0]), ids_cmp_r, ids_swap_r, data);
+
+	/* After sorting, equal ids are adjacent. The index has the type of @cnt. */
+	for (i = 1; i < cnt; i++) {
+		if (ids[i] == ids[i - 1])
+			return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * Create a BPF_LINK_TYPE_TRACING_MULTI link for @prog.
+ *
+ * @prog: program to attach; its expected_attach_type becomes the link's
+ * attach type.
+ * @attr: link_create attributes; tracing_multi.ids / .cnt describe a user
+ * array of u32 ids, tracing_multi.cookies optionally points to a
+ * user array of u64 cookies of the same length.
+ *
+ * Returns the value of bpf_link_settle() on success, or a negative errno:
+ * -EINVAL no ids, flags or target_fd set, or a duplicated id
+ * -E2BIG more than MAX_TRACING_MULTI_CNT ids
+ * -ENOMEM an allocation failed
+ * -EFAULT copying ids or cookies from user space failed
+ * or whatever bpf_link_prime() / bpf_trampoline_multi_attach() returned.
+ */
+int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr)
+{
+ struct bpf_tracing_multi_link *link = NULL;
+ struct bpf_tramp_node *fexits = NULL;
+ struct bpf_link_primer link_primer;
+ u32 cnt, *ids = NULL;
+ u64 __user *ucookies;
+ u64 *cookies = NULL;
+ u32 __user *uids;
+ int err;
+
+ uids = u64_to_user_ptr(attr->link_create.tracing_multi.ids);
+ cnt = attr->link_create.tracing_multi.cnt;
+
+ if (!cnt || !uids)
+ return -EINVAL;
+ if (cnt > MAX_TRACING_MULTI_CNT)
+ return -E2BIG;
+ if (attr->link_create.flags || attr->link_create.target_fd)
+ return -EINVAL;
+
+ ids = kvmalloc_objs(*ids, cnt);
+ if (!ids)
+ return -ENOMEM;
+
+ if (copy_from_user(ids, uids, cnt * sizeof(*ids))) {
+ err = -EFAULT;
+ goto error;
+ }
+
+ /* Cookies are optional; a NULL user pointer leaves cookies == NULL. */
+ ucookies = u64_to_user_ptr(attr->link_create.tracing_multi.cookies);
+ if (ucookies) {
+ cookies = kvmalloc_objs(*cookies, cnt);
+ if (!cookies) {
+ err = -ENOMEM;
+ goto error;
+ }
+ if (copy_from_user(cookies, ucookies, cnt * sizeof(*cookies))) {
+ err = -EFAULT;
+ goto error;
+ }
+ }
+
+ /* Sorts ids (and cookies alongside) in place and rejects duplicates. */
+ err = check_dup_ids(ids, cookies, cnt);
+ if (err)
+ goto error;
+
+ /* The fexits array is only allocated for BPF_TRACE_FSESSION_MULTI. */
+ if (prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) {
+ fexits = kvmalloc_objs(*fexits, cnt);
+ if (!fexits) {
+ err = -ENOMEM;
+ goto error;
+ }
+ }
+
+ link = kvzalloc_flex(*link, nodes, cnt);
+ if (!link) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING_MULTI,
+ &bpf_tracing_multi_link_lops, prog, prog->expected_attach_type);
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto error;
+
+ /*
+ * From here on cookies and fexits are reachable through the link, and
+ * failures no longer use the error label below.
+ */
+ link->nodes_cnt = cnt;
+ link->cookies = cookies;
+ link->fexits = fexits;
+
+ err = bpf_trampoline_multi_attach(prog, ids, link);
+ /* ids is not stored in the link; it is freed on success and failure. */
+ kvfree(ids);
+ if (err) {
+ /*
+ * NOTE(review): link, cookies and fexits are not freed directly
+ * on this path; presumably bpf_link_cleanup() ends up in the
+ * link's dealloc callback, which kvfree()s them - confirm.
+ */
+ bpf_link_cleanup(&link_primer);
+ return err;
+ }
+ return bpf_link_settle(&link_primer);
+
+error:
+ /* Only reached before the link was primed; unset pointers are NULL. */
+ kvfree(fexits);
+ kvfree(cookies);
+ kvfree(ids);
+ kvfree(link);
+ return err;
+}
+
+#else
+
+/*
+ * Stub built unless both CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS and
+ * CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS are defined: always fails with
+ * -EOPNOTSUPP.
+ */
+int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr)
+{
+ return -EOPNOTSUPP;
+}
+
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS && CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS */
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index cc49ebd2a773..f378613ad120 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -1093,14 +1093,15 @@ static int unregister_fprobe_nolock(struct fprobe *fp)
}
/**
- * unregister_fprobe() - Unregister fprobe.
+ * unregister_fprobe_async() - Unregister fprobe without RCU GP wait
* @fp: A fprobe data structure to be unregistered.
*
* Unregister fprobe (and remove ftrace hooks from the function entries).
+ * This function will NOT wait until the fprobe is no longer used.
*
* Return 0 if @fp is unregistered successfully, -errno if not.
*/
-int unregister_fprobe(struct fprobe *fp)
+int unregister_fprobe_async(struct fprobe *fp)
{
guard(mutex)(&fprobe_mutex);
if (!fp || !fprobe_registered(fp))
@@ -1108,6 +1109,24 @@ int unregister_fprobe(struct fprobe *fp)
return unregister_fprobe_nolock(fp);
}
+
+/**
+ * unregister_fprobe() - Unregister fprobe and wait for an RCU grace period
+ * @fp: A fprobe data structure to be unregistered.
+ *
+ * Unregister fprobe (and remove ftrace hooks from the function entries) via
+ * unregister_fprobe_async(). When that succeeds, block in synchronize_rcu()
+ * before returning so that the fprobe is no longer used once this returns.
+ * On failure nothing is waited for.
+ *
+ * Return 0 if @fp is unregistered successfully, -errno if not.
+ */
+int unregister_fprobe(struct fprobe *fp)
+{
+	int ret;
+
+	ret = unregister_fprobe_async(fp);
+	if (ret)
+		return ret;
+
+	synchronize_rcu();
+	return 0;
+}
EXPORT_SYMBOL_GPL(unregister_fprobe);
static int __init fprobe_initcall(void)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b2611de3f594..f93e34dd2328 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1198,8 +1198,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
return __ftrace_lookup_ip(hash, ip);
}
-static void __add_hash_entry(struct ftrace_hash *hash,
- struct ftrace_func_entry *entry)
+void add_ftrace_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry)
{
struct hlist_head *hhd;
unsigned long key;
@@ -1221,7 +1220,7 @@ add_ftrace_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigne
entry->ip = ip;
entry->direct = direct;
- __add_hash_entry(hash, entry);
+ add_ftrace_hash_entry(hash, entry);
return entry;
}
@@ -1249,6 +1248,25 @@ remove_hash_entry(struct ftrace_hash *hash,
hash->count--;
}
+/*
+ * Pass every entry of @hash to remove_hash_entry().
+ *
+ * A NULL or already empty hash is left untouched. This function itself
+ * frees nothing. Once all buckets have been walked the hash is expected
+ * to report a count of zero; anything else triggers FTRACE_WARN_ON().
+ */
+void ftrace_hash_remove(struct ftrace_hash *hash)
+{
+	struct ftrace_func_entry *entry;
+	struct hlist_node *next;
+	int nr_buckets;
+	int bucket;
+
+	if (!hash || !hash->count)
+		return;
+
+	nr_buckets = 1 << hash->size_bits;
+	for (bucket = 0; bucket < nr_buckets; bucket++) {
+		/* _safe variant: the current entry is unlinked while iterating. */
+		hlist_for_each_entry_safe(entry, next, &hash->buckets[bucket], hlist)
+			remove_hash_entry(hash, entry);
+	}
+	FTRACE_WARN_ON(hash->count);
+}
+
static void ftrace_hash_clear(struct ftrace_hash *hash)
{
struct hlist_head *hhd;
@@ -1458,7 +1476,7 @@ static struct ftrace_hash *__move_hash(struct ftrace_hash *src, int size)
hhd = &src->buckets[i];
hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
remove_hash_entry(src, entry);
- __add_hash_entry(new_hash, entry);
+ add_ftrace_hash_entry(new_hash, entry);
}
}
return new_hash;
@@ -5341,7 +5359,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
map->entry.ip = ip;
map->data = data;
- __add_hash_entry(&mapper->hash, &map->entry);
+ add_ftrace_hash_entry(&mapper->hash, &map->entry);
return 0;
}
@@ -6288,11 +6306,16 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
}
EXPORT_SYMBOL_GPL(modify_ftrace_direct);
-static unsigned long hash_count(struct ftrace_hash *hash)
+static inline unsigned long hash_count(struct ftrace_hash *hash)
{
return hash ? hash->count : 0;
}
+/*
+ * Non-static wrapper around hash_count(): returns the entry count of @hash,
+ * or 0 when @hash is NULL.
+ */
+unsigned long ftrace_hash_count(struct ftrace_hash *hash)
+{
+ return hash_count(hash);
+}
+
/**
* hash_add - adds two struct ftrace_hash and returns the result
* @a: struct ftrace_hash object
diff --git a/kernel/trace/remote_test.c b/kernel/trace/remote_test.c
index 6c1b7701ddae..a3e2c9b606eb 100644
--- a/kernel/trace/remote_test.c
+++ b/kernel/trace/remote_test.c
@@ -110,9 +110,9 @@ static struct trace_buffer_desc *remote_test_load(unsigned long size, void *unus
return remote_test_buffer_desc;
err_unload:
- for_each_ring_buffer_desc(rb_desc, cpu, remote_test_buffer_desc)
+ for_each_ring_buffer_desc(rb_desc, cpu, desc)
remote_test_unload_simple_rb(rb_desc->cpu);
- trace_remote_free_buffer(remote_test_buffer_desc);
+ trace_remote_free_buffer(desc);
err_free_desc:
kfree(desc);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5326924615a4..ebae64ec2f11 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -7,6 +7,7 @@
#include <linux/ring_buffer_types.h>
#include <linux/sched/isolation.h>
#include <linux/trace_recursion.h>
+#include <linux/panic_notifier.h>
#include <linux/trace_events.h>
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
@@ -31,6 +32,7 @@
#include <linux/oom.h>
#include <linux/mm.h>
+#include <asm/ring_buffer.h>
#include <asm/local64.h>
#include <asm/local.h>
#include <asm/setup.h>
@@ -559,6 +561,7 @@ struct trace_buffer {
unsigned long range_addr_start;
unsigned long range_addr_end;
+ struct notifier_block flush_nb;
struct ring_buffer_meta *meta;
@@ -2521,6 +2524,16 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
kfree(cpu_buffer);
}
+/*
+ * Notifier callback for the flush_nb block embedded in struct trace_buffer.
+ * alloc_buffer() registers it on panic_notifier_list for buffers created
+ * with a start/end range (persistent ring buffers).
+ *
+ * Stops recording on the buffer and then hands the buffer's
+ * range_addr_start/range_addr_end to arch_ring_buffer_flush_range() so the
+ * cache can be flushed if needed. @event and @data are not used.
+ *
+ * Always returns NOTIFY_DONE.
+ */
+static int rb_flush_buffer_cb(struct notifier_block *nb, unsigned long event, void *data)
+{
+ struct trace_buffer *buffer = container_of(nb, struct trace_buffer, flush_nb);
+
+ ring_buffer_record_off(buffer);
+ arch_ring_buffer_flush_range(buffer->range_addr_start, buffer->range_addr_end);
+ return NOTIFY_DONE;
+}
+
static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
int order, unsigned long start,
unsigned long end,
@@ -2651,6 +2664,12 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
mutex_init(&buffer->mutex);
+ /* Persistent ring buffer needs to flush cache before reboot. */
+ if (start && end) {
+ buffer->flush_nb.notifier_call = rb_flush_buffer_cb;
+ atomic_notifier_chain_register(&panic_notifier_list, &buffer->flush_nb);
+ }
+
return_ptr(buffer);
fail_free_buffers:
@@ -2749,6 +2768,9 @@ ring_buffer_free(struct trace_buffer *buffer)
{
int cpu;
+ if (buffer->range_addr_start && buffer->range_addr_end)
+ atomic_notifier_chain_unregister(&panic_notifier_list, &buffer->flush_nb);
+
cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
irq_work_sync(&buffer->irq_work.work);
@@ -3769,13 +3791,6 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
return skip_time_extend(event);
}
-#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-static inline bool sched_clock_stable(void)
-{
- return true;
-}
-#endif
-
static void
rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
struct rb_event_info *info)
@@ -5407,6 +5422,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
iter->head_page = cpu_buffer->reader_page;
iter->head = cpu_buffer->reader_page->read;
iter->next_event = iter->head;
+ iter->missed_events = 0;
iter->cache_reader_page = iter->head_page;
iter->cache_read = cpu_buffer->read;
@@ -6086,10 +6102,7 @@ ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts,
*/
bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter)
{
- bool ret = iter->missed_events != 0;
-
- iter->missed_events = 0;
- return ret;
+ return iter->missed_events != 0;
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped);
@@ -6251,7 +6264,7 @@ void ring_buffer_iter_advance(struct ring_buffer_iter *iter)
unsigned long flags;
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
-
+ iter->missed_events = 0;
rb_advance_iter(iter);
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
diff --git a/kernel/trace/rv/monitors/deadline/deadline.h b/kernel/trace/rv/monitors/deadline/deadline.h
index 0bbfd2543329..78fca873d61e 100644
--- a/kernel/trace/rv/monitors/deadline/deadline.h
+++ b/kernel/trace/rv/monitors/deadline/deadline.h
@@ -95,7 +95,8 @@ static inline u8 get_server_type(struct task_struct *tsk)
static inline int extract_params(struct pt_regs *regs, long id, pid_t *pid_out)
{
size_t size = offsetofend(struct sched_attr, sched_flags);
- struct sched_attr __user *uattr, attr;
+ struct sched_attr __user *uattr;
+ struct sched_attr attr;
int new_policy = -1, ret;
unsigned long args[6];
diff --git a/kernel/trace/rv/monitors/nomiss/nomiss.c b/kernel/trace/rv/monitors/nomiss/nomiss.c
index 31f90f3638d8..8ead8783c29f 100644
--- a/kernel/trace/rv/monitors/nomiss/nomiss.c
+++ b/kernel/trace/rv/monitors/nomiss/nomiss.c
@@ -227,7 +227,7 @@ static int enable_nomiss(void)
{
int retval;
- retval = da_monitor_init();
+ retval = ha_monitor_init();
if (retval)
return retval;
@@ -263,7 +263,7 @@ static void disable_nomiss(void)
rv_detach_trace_probe("nomiss", sched_switch, handle_sched_switch);
rv_detach_trace_probe("nomiss", sched_wakeup, handle_sched_wakeup);
- da_monitor_destroy();
+ ha_monitor_destroy();
}
static struct rv_monitor rv_this = {
diff --git a/kernel/trace/rv/monitors/opid/opid.c b/kernel/trace/rv/monitors/opid/opid.c
index 4594c7c46601..3b6a85e815b8 100644
--- a/kernel/trace/rv/monitors/opid/opid.c
+++ b/kernel/trace/rv/monitors/opid/opid.c
@@ -22,14 +22,8 @@ static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_opid env, u64 time_ns
if (env == irq_off_opid)
return irqs_disabled();
else if (env == preempt_off_opid) {
- /*
- * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables
- * preemption (adding one to the preempt_count). Since we are
- * interested in the preempt_count at the time the tracepoint was
- * hit, we consider 1 as still enabled.
- */
if (IS_ENABLED(CONFIG_PREEMPTION))
- return (preempt_count() & PREEMPT_MASK) > 1;
+ return (preempt_count() & PREEMPT_MASK) > 0;
return true;
}
return ENV_INVALID_VALUE;
@@ -73,7 +67,7 @@ static int enable_opid(void)
{
int retval;
- retval = da_monitor_init();
+ retval = ha_monitor_init();
if (retval)
return retval;
@@ -90,7 +84,7 @@ static void disable_opid(void)
rv_detach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched);
rv_detach_trace_probe("opid", sched_waking, handle_sched_waking);
- da_monitor_destroy();
+ ha_monitor_destroy();
}
/*
diff --git a/kernel/trace/rv/monitors/stall/stall.c b/kernel/trace/rv/monitors/stall/stall.c
index 9ccfda6b0e73..3c38fb1a0159 100644
--- a/kernel/trace/rv/monitors/stall/stall.c
+++ b/kernel/trace/rv/monitors/stall/stall.c
@@ -103,7 +103,7 @@ static int enable_stall(void)
{
int retval;
- retval = da_monitor_init();
+ retval = ha_monitor_init();
if (retval)
return retval;
@@ -120,7 +120,7 @@ static void disable_stall(void)
rv_detach_trace_probe("stall", sched_switch, handle_sched_switch);
rv_detach_trace_probe("stall", sched_wakeup, handle_sched_wakeup);
- da_monitor_destroy();
+ ha_monitor_destroy();
}
static struct rv_monitor rv_this = {
diff --git a/kernel/trace/simple_ring_buffer.c b/kernel/trace/simple_ring_buffer.c
index 02af2297ae5a..f4642f5adda3 100644
--- a/kernel/trace/simple_ring_buffer.c
+++ b/kernel/trace/simple_ring_buffer.c
@@ -395,7 +395,6 @@ int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer,
memset(cpu_buffer->meta, 0, sizeof(*cpu_buffer->meta));
cpu_buffer->meta->meta_page_size = PAGE_SIZE;
- cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages;
/* The reader page is not part of the ring initially */
page = load_page(desc->page_va[0]);
@@ -431,12 +430,13 @@ int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer,
if (ret) {
for (i--; i >= 0; i--)
- unload_page((void *)desc->page_va[i]);
+ unload_page(bpages[i].page);
unload_page(cpu_buffer->meta);
return ret;
}
+ cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages;
/* Close the ring */
bpage->link.next = &cpu_buffer->tail_page->link;
cpu_buffer->tail_page->link.prev = &bpage->link;
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 0dbbf6cca9bc..eb2c2bc8bc3d 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1369,10 +1369,8 @@ static const char *hist_field_name(struct hist_field *field,
len = snprintf(full_name, sizeof(full_name), fmt,
field->system, field->event_name,
field->name);
- if (len >= sizeof(full_name))
- return NULL;
-
- field_name = full_name;
+ if (len < sizeof(full_name))
+ field_name = full_name;
} else
field_name = field->name;
} else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 75678053b21c..5e83c4f6f2b4 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -83,6 +83,22 @@ struct osnoise_instance {
static struct list_head osnoise_instances;
+/*
+ * osnoise_print - printf-style message to every registered osnoise instance.
+ * @fmt: format string, followed by its arguments.
+ *
+ * Walks osnoise_instances under rcu_read_lock() and passes the formatted
+ * message to trace_array_vprintk() for each instance's trace array.
+ */
+static void osnoise_print(const char *fmt, ...)
+{
+ struct osnoise_instance *inst;
+ struct trace_array *tr;
+ va_list ap;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ tr = inst->tr;
+ /*
+ * The va_list is restarted for every instance: once it has been
+ * handed to a v*printk function it must not be reused.
+ */
+ va_start(ap, fmt);
+ trace_array_vprintk(tr, _RET_IP_, fmt, ap);
+ va_end(ap);
+ }
+ rcu_read_unlock();
+}
+
static bool osnoise_has_registered_instances(void)
{
return !!list_first_or_null_rcu(&osnoise_instances,
@@ -123,6 +139,7 @@ static int osnoise_register_instance(struct trace_array *tr)
* trace_types_lock.
*/
lockdep_assert_held(&trace_types_lock);
+ trace_array_init_printk(tr);
inst = kmalloc_obj(*inst);
if (!inst)
@@ -471,15 +488,7 @@ static void print_osnoise_headers(struct seq_file *s)
* osnoise_taint - report an osnoise error.
*/
#define osnoise_taint(msg) ({ \
- struct osnoise_instance *inst; \
- struct trace_buffer *buffer; \
- \
- rcu_read_lock(); \
- list_for_each_entry_rcu(inst, &osnoise_instances, list) { \
- buffer = inst->tr->array_buffer.buffer; \
- trace_array_printk_buf(buffer, _THIS_IP_, msg); \
- } \
- rcu_read_unlock(); \
+ osnoise_print(msg); \
osnoise_data.tainted = true; \
})
@@ -1189,10 +1198,10 @@ static __always_inline void osnoise_stop_exception(char *msg, int cpu)
rcu_read_lock();
list_for_each_entry_rcu(inst, &osnoise_instances, list) {
tr = inst->tr;
- trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_,
- "stop tracing hit on cpu %d due to exception: %s\n",
- smp_processor_id(),
- msg);
+ trace_array_printk(tr, _THIS_IP_,
+ "stop tracing hit on cpu %d due to exception: %s\n",
+ smp_processor_id(),
+ msg);
if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options))
panic("tracer hit on cpu %d due to exception: %s\n",
@@ -1362,8 +1371,8 @@ static __always_inline void osnoise_stop_tracing(void)
rcu_read_lock();
list_for_each_entry_rcu(inst, &osnoise_instances, list) {
tr = inst->tr;
- trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_,
- "stop tracing hit on cpu %d\n", smp_processor_id());
+ trace_array_printk(tr, _THIS_IP_,
+ "stop tracing hit on cpu %d\n", smp_processor_id());
if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options))
panic("tracer hit stop condition on CPU %d\n", smp_processor_id());
@@ -2544,9 +2553,12 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count,
notify_new_max_latency(diff);
tlat->tracing_thread = false;
- if (osnoise_data.stop_tracing_total)
- if (time_to_us(diff) >= osnoise_data.stop_tracing_total)
+ if (osnoise_data.stop_tracing_total) {
+ if (time_to_us(diff) >= osnoise_data.stop_tracing_total) {
+ timerlat_dump_stack(time_to_us(diff));
osnoise_stop_tracing();
+ }
+ }
} else {
tlat->tracing_thread = false;
tlat->kthread = current;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index e0d3a0da26af..fd1caa1f9723 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -332,6 +332,23 @@ static int parse_trace_event_arg(char *arg, struct fetch_insn *code,
return -ENOENT;
}
+/*
+ * Resolve @arg for a trace-event probe argument.
+ *
+ * @arg is first handed to parse_trace_event_arg(); if that does not succeed,
+ * the names "comm" and "COMM" are accepted and mapped to FETCH_OP_COMM.
+ *
+ * Returns 0 on success, -EFAULT when @code already has data attached, and
+ * -EINVAL when @arg matches nothing.
+ */
+static int parse_trace_event(char *arg, struct fetch_insn *code,
+			     struct traceprobe_parse_context *ctx)
+{
+	if (code->data)
+		return -EFAULT;
+
+	if (!parse_trace_event_arg(arg, code, ctx))
+		return 0;
+
+	if (strcmp(arg, "comm") != 0 && strcmp(arg, "COMM") != 0)
+		return -EINVAL;
+
+	code->op = FETCH_OP_COMM;
+	return 0;
+}
+
#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
static u32 btf_type_int(const struct btf_type *t)
@@ -376,11 +393,16 @@ static bool btf_type_is_char_array(struct btf *btf, const struct btf_type *type)
&& BTF_INT_BITS(intdata) == 8;
}
+/*
+ * Pick the BTF object to use for type lookups: the struct BTF when one has
+ * been set on @ctx, otherwise the regular ctx->btf (which may be NULL).
+ */
+static struct btf *ctx_btf(struct traceprobe_parse_context *ctx)
+{
+	if (ctx->struct_btf)
+		return ctx->struct_btf;
+
+	return ctx->btf;
+}
+
static int check_prepare_btf_string_fetch(char *typename,
struct fetch_insn **pcode,
struct traceprobe_parse_context *ctx)
{
- struct btf *btf = ctx->btf;
+ struct btf *btf = ctx_btf(ctx);
if (!btf || !ctx->last_type)
return 0;
@@ -506,6 +528,15 @@ static int query_btf_context(struct traceprobe_parse_context *ctx)
return 0;
}
+/*
+ * Drop the struct BTF held by @ctx, if any: put the reference and reset
+ * both struct_btf and last_struct. Does nothing when no struct BTF is set.
+ */
+static void clear_struct_btf(struct traceprobe_parse_context *ctx)
+{
+	if (!ctx->struct_btf)
+		return;
+
+	btf_put(ctx->struct_btf);
+	ctx->struct_btf = NULL;
+	ctx->last_struct = NULL;
+}
+
static void clear_btf_context(struct traceprobe_parse_context *ctx)
{
if (ctx->btf) {
@@ -554,22 +585,29 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
struct fetch_insn *code = *pcode;
const struct btf_member *field;
u32 bitoffs, anon_offs;
+ bool is_struct = ctx->struct_btf != NULL;
+ struct btf *btf = ctx_btf(ctx);
char *next;
int is_ptr;
s32 tid;
do {
- /* Outer loop for solving arrow operator ('->') */
- if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) {
- trace_probe_log_err(ctx->offset, NO_PTR_STRCT);
- return -EINVAL;
- }
- /* Convert a struct pointer type to a struct type */
- type = btf_type_skip_modifiers(ctx->btf, type->type, &tid);
- if (!type) {
- trace_probe_log_err(ctx->offset, BAD_BTF_TID);
- return -EINVAL;
+ if (!is_struct) {
+ /* Outer loop for solving arrow operator ('->') */
+ if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) {
+ trace_probe_log_err(ctx->offset, NO_PTR_STRCT);
+ return -EINVAL;
+ }
+
+ /* Convert a struct pointer type to a struct type */
+ type = btf_type_skip_modifiers(btf, type->type, &tid);
+ if (!type) {
+ trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+ return -EINVAL;
+ }
}
+ /* Only the first type can skip being a pointer */
+ is_struct = false;
bitoffs = 0;
do {
@@ -580,7 +618,7 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
return is_ptr;
anon_offs = 0;
- field = btf_find_struct_member(ctx->btf, type, fieldname,
+ field = btf_find_struct_member(btf, type, fieldname,
&anon_offs);
if (IS_ERR(field)) {
trace_probe_log_err(ctx->offset, BAD_BTF_TID);
@@ -602,7 +640,7 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
ctx->last_bitsize = 0;
}
- type = btf_type_skip_modifiers(ctx->btf, field->type, &tid);
+ type = btf_type_skip_modifiers(btf, field->type, &tid);
if (!type) {
trace_probe_log_err(ctx->offset, BAD_BTF_TID);
return -EINVAL;
@@ -640,7 +678,7 @@ static int parse_btf_arg(char *varname,
int i, is_ptr, ret;
u32 tid;
- if (WARN_ON_ONCE(!ctx->funcname))
+ if (WARN_ON_ONCE(!ctx->funcname && !(ctx->flags & TPARG_FL_TEVENT)))
return -EINVAL;
is_ptr = split_next_field(varname, &field, ctx);
@@ -653,6 +691,19 @@ static int parse_btf_arg(char *varname,
return -EOPNOTSUPP;
}
+ if (ctx->flags & TPARG_FL_TEVENT) {
+ ret = parse_trace_event(varname, code, ctx);
+ if (ret < 0) {
+ trace_probe_log_err(ctx->offset, BAD_ATTACH_ARG);
+ return ret;
+ }
+ /* TEVENT is only here via a typecast */
+ if (WARN_ON_ONCE(ctx->struct_btf == NULL))
+ return -EINVAL;
+ type = ctx->last_struct;
+ goto found_type;
+ }
+
if (ctx->flags & TPARG_FL_RETURN && !strcmp(varname, "$retval")) {
code->op = FETCH_OP_RETVAL;
/* Check whether the function return type is not void */
@@ -709,6 +760,7 @@ static int parse_btf_arg(char *varname,
found:
type = btf_type_skip_modifiers(ctx->btf, tid, &tid);
+found_type:
if (!type) {
trace_probe_log_err(ctx->offset, BAD_BTF_TID);
return -EINVAL;
@@ -727,7 +779,7 @@ found:
static const struct fetch_type *find_fetch_type_from_btf_type(
struct traceprobe_parse_context *ctx)
{
- struct btf *btf = ctx->btf;
+ struct btf *btf = ctx_btf(ctx);
const char *typestr = NULL;
if (btf && ctx->last_type)
@@ -758,7 +810,67 @@ static int parse_btf_bitfield(struct fetch_insn **pcode,
return 0;
}
-#else
+/*
+ * Look up the struct named @sname with bpf_find_btf_id() and remember it in
+ * @ctx: the BTF object goes to ctx->struct_btf and the struct's type to
+ * ctx->last_struct.
+ *
+ * The stored BTF object is released later with btf_put() by
+ * clear_struct_btf().
+ *
+ * Returns 0 on success or the negative value returned by bpf_find_btf_id().
+ */
+static int query_btf_struct(const char *sname, struct traceprobe_parse_context *ctx)
+{
+ struct btf *btf = NULL;
+ int id;
+
+ /* A struct_btf should only be used by a single argument */
+ if (WARN_ON_ONCE(ctx->struct_btf)) {
+ /* Unexpected leftover: warn once and drop it before the new lookup. */
+ btf_put(ctx->struct_btf);
+ ctx->struct_btf = NULL;
+ }
+
+ id = bpf_find_btf_id(sname, BTF_KIND_STRUCT, &btf);
+ if (id < 0)
+ return id;
+ ctx->struct_btf = btf;
+ ctx->last_struct = btf_type_by_id(ctx->struct_btf, id);
+ return 0;
+}
+
+/*
+ * Parse an argument of the form "(<struct name>)<rest>".
+ *
+ * @arg: argument text; parse_probe_arg() calls this when arg[0] is '('.
+ * @pcode, @end: fetch instruction cursor and limit, passed on unchanged to
+ * parse_btf_arg().
+ * @ctx: parse context; must have TPARG_FL_TEVENT set.
+ *
+ * The name between the parentheses is looked up with query_btf_struct() and
+ * the text after the closing ')' is then parsed by parse_btf_arg().
+ *
+ * Returns the result of parse_btf_arg(), or -EINVAL when the context is not
+ * a trace event, the ')' is missing, or the struct lookup fails.
+ */
+static int handle_typecast(char *arg, struct fetch_insn **pcode,
+ struct fetch_insn *end,
+ struct traceprobe_parse_context *ctx)
+{
+ char *tmp;
+ int ret;
+
+ /* Currently this only works for eprobes */
+ if (!(ctx->flags & TPARG_FL_TEVENT)) {
+ trace_probe_log_err(ctx->offset, TYPECAST_NOT_EVENT);
+ return -EINVAL;
+ }
+
+ tmp = strchr(arg, ')');
+ if (!tmp) {
+ trace_probe_log_err(ctx->offset + strlen(arg),
+ DEREF_OPEN_BRACE);
+ return -EINVAL;
+ }
+ /* Temporarily terminate at ')' so arg + 1 is just the struct name. */
+ *tmp = '\0';
+ ret = query_btf_struct(arg + 1, ctx);
+ *tmp = ')';
+
+ if (ret < 0) {
+ /* + 1 points the error at the name, just past the '('. */
+ trace_probe_log_err(ctx->offset + 1, NO_PTR_STRCT);
+ return -EINVAL;
+ }
+
+ /* Step over ')' to the start of the expression being cast. */
+ tmp++;
+
+ /* Keep error offsets relative to the whole argument string. */
+ ctx->offset += tmp - arg;
+ ret = parse_btf_arg(tmp, pcode, end, ctx);
+ return ret;
+}
+
+#else /* !CONFIG_PROBE_EVENTS_BTF_ARGS */
+
+/*
+ * !CONFIG_PROBE_EVENTS_BTF_ARGS variant: only resets the struct_btf pointer;
+ * no btf_put() is done in this configuration.
+ */
+static void clear_struct_btf(struct traceprobe_parse_context *ctx)
+{
+ ctx->struct_btf = NULL;
+}
+
static void clear_btf_context(struct traceprobe_parse_context *ctx)
{
ctx->btf = NULL;
@@ -794,7 +906,15 @@ static int check_prepare_btf_string_fetch(char *typename,
return 0;
}
-#endif
+/*
+ * !CONFIG_PROBE_EVENTS_BTF_ARGS variant: typecasts are not available, so
+ * log NOSUP_BTFARG at the current offset and fail with -EOPNOTSUPP.
+ * @arg, @pcode and @end are not used.
+ */
+static int handle_typecast(char *arg, struct fetch_insn **pcode,
+ struct fetch_insn *end,
+ struct traceprobe_parse_context *ctx)
+{
+ trace_probe_log_err(ctx->offset, NOSUP_BTFARG);
+ return -EOPNOTSUPP;
+}
+
+#endif /* CONFIG_PROBE_EVENTS_BTF_ARGS */
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
@@ -838,15 +958,10 @@ static int __store_entry_arg(struct trace_probe *tp, int argnum)
int i, offset, last_offset = 0;
if (!earg) {
- earg = kzalloc_obj(*tp->entry_arg);
+ earg = kzalloc_flex(*earg, code, 2 * tp->nr_args + 1);
if (!earg)
return -ENOMEM;
earg->size = 2 * tp->nr_args + 1;
- earg->code = kzalloc_objs(struct fetch_insn, earg->size);
- if (!earg->code) {
- kfree(earg);
- return -ENOMEM;
- }
/* Fill the code buffer with 'end' to simplify it */
for (i = 0; i < earg->size; i++)
earg->code[i].op = FETCH_OP_END;
@@ -953,18 +1068,9 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
int len;
if (ctx->flags & TPARG_FL_TEVENT) {
- if (code->data)
- return -EFAULT;
- ret = parse_trace_event_arg(arg, code, ctx);
- if (!ret)
- return 0;
- if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) {
- code->op = FETCH_OP_COMM;
- return 0;
- }
- /* backward compatibility */
- ctx->offset = 0;
- goto inval;
+ if (parse_trace_event(arg, code, ctx) < 0)
+ goto inval;
+ return 0;
}
if (str_has_prefix(arg, "retval")) {
@@ -1231,6 +1337,9 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
code->op = FETCH_OP_IMM;
}
break;
+ case '(':
+ ret = handle_typecast(arg, pcode, end, ctx);
+ break;
default:
if (isalpha(arg[0]) || arg[0] == '_') { /* BTF variable */
if (!tparg_is_function_entry(ctx->flags) &&
@@ -1563,6 +1672,9 @@ fail:
}
kfree(tmp);
+ /* struct_btf should not be passed to other arguments */
+ clear_struct_btf(ctx);
+
return ret;
}
@@ -2051,7 +2163,6 @@ void trace_probe_cleanup(struct trace_probe *tp)
traceprobe_free_probe_arg(&tp->args[i]);
if (tp->entry_arg) {
- kfree(tp->entry_arg->code);
kfree(tp->entry_arg);
tp->entry_arg = NULL;
}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 262d8707a3df..15758cc11fc6 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -238,8 +238,8 @@ struct probe_arg {
};
struct probe_entry_arg {
- struct fetch_insn *code;
unsigned int size; /* The entry data size */
+ struct fetch_insn code[] __counted_by(size);
};
struct trace_uprobe_filter {
@@ -422,7 +422,9 @@ struct traceprobe_parse_context {
const struct btf_param *params; /* Parameter of the function */
s32 nr_params; /* The number of the parameters */
struct btf *btf; /* The BTF to be used */
+ struct btf *struct_btf; /* The BTF to be used for structs */
const struct btf_type *last_type; /* Saved type */
+ const struct btf_type *last_struct; /* Saved structure */
u32 last_bitoffs; /* Saved bitoffs */
u32 last_bitsize; /* Saved bitsize */
struct trace_probe *tp;
@@ -563,7 +565,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\
C(TOO_MANY_ARGS, "Too many arguments are specified"), \
C(TOO_MANY_EARGS, "Too many entry arguments specified"), \
- C(EVENT_TOO_BIG, "Event too big (too many fields?)"),
+ C(EVENT_TOO_BIG, "Event too big (too many fields?)"), \
+ C(TYPECAST_NOT_EVENT, "Typecasts are only for eprobe fields"),
#undef C
#define C(a, b) TP_ERR_##a
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8ad72e17d8eb..e98ee7e1e66f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1371,33 +1371,33 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
-static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs,
+static int perf_call_bpf_enter(struct trace_event_call *call,
struct syscall_metadata *sys_data,
- struct syscall_trace_enter *rec)
+ int syscall_nr, unsigned long *args)
{
struct syscall_tp_t {
struct trace_entry ent;
int syscall_nr;
unsigned long args[SYSCALL_DEFINE_MAXARGS];
} __aligned(8) param;
+ struct pt_regs regs = {};
int i;
BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
- /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
- perf_fetch_caller_regs(regs);
- *(struct pt_regs **)&param = regs;
- param.syscall_nr = rec->nr;
+ /* bpf prog requires 'regs' to be the first member in the ctx */
+ perf_fetch_caller_regs(&regs);
+ *(struct pt_regs **)&param = &regs;
+ param.syscall_nr = syscall_nr;
for (i = 0; i < sys_data->nb_args; i++)
- param.args[i] = rec->args[i];
- return trace_call_bpf(call, &param);
+ param.args[i] = args[i];
+ return trace_call_bpf_faultable(call, &param);
}
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
- struct pt_regs *fake_regs;
struct hlist_head *head;
unsigned long args[6];
bool valid_prog_array;
@@ -1410,12 +1410,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
int size = 0;
int uargs = 0;
- /*
- * Syscall probe called with preemption enabled, but the ring
- * buffer and per-cpu data require preemption to be disabled.
- */
might_fault();
- guard(preempt_notrace)();
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
@@ -1429,6 +1424,26 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
syscall_get_arguments(current, regs, args);
+ /*
+ * Run BPF program in faultable context before per-cpu buffer
+ * allocation, allowing sleepable BPF programs to execute.
+ */
+ valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
+ if (valid_prog_array &&
+ !perf_call_bpf_enter(sys_data->enter_event, sys_data,
+ syscall_nr, args))
+ return;
+
+ /*
+ * Per-cpu ring buffer and perf event list operations require
+ * preemption to be disabled.
+ */
+ guard(preempt_notrace)();
+
+ head = this_cpu_ptr(sys_data->enter_event->perf_events);
+ if (hlist_empty(head))
+ return;
+
/* Check if this syscall event faults in user space memory */
mayfault = sys_data->user_mask != 0;
@@ -1438,17 +1453,12 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
return;
}
- head = this_cpu_ptr(sys_data->enter_event->perf_events);
- valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
- if (!valid_prog_array && hlist_empty(head))
- return;
-
/* get the size after alignment with the u32 buffer size field */
size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
size = ALIGN(size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
+ rec = perf_trace_buf_alloc(size, NULL, &rctx);
if (!rec)
return;
@@ -1458,13 +1468,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
if (mayfault)
syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs);
- if ((valid_prog_array &&
- !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
- hlist_empty(head)) {
- perf_swevent_put_recursion_context(rctx);
- return;
- }
-
perf_trace_buf_submit(rec, size, rctx,
sys_data->enter_event->event.type, 1, regs,
head, NULL);
@@ -1514,40 +1517,35 @@ static void perf_sysenter_disable(struct trace_event_call *call)
syscall_fault_buffer_disable();
}
-static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
- struct syscall_trace_exit *rec)
+static int perf_call_bpf_exit(struct trace_event_call *call,
+ int syscall_nr, long ret_val)
{
struct syscall_tp_t {
struct trace_entry ent;
int syscall_nr;
unsigned long ret;
} __aligned(8) param;
-
- /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
- perf_fetch_caller_regs(regs);
- *(struct pt_regs **)&param = regs;
- param.syscall_nr = rec->nr;
- param.ret = rec->ret;
- return trace_call_bpf(call, &param);
+ struct pt_regs regs = {};
+
+ /* bpf prog requires 'regs' to be the first member in the ctx */
+ perf_fetch_caller_regs(&regs);
+ *(struct pt_regs **)&param = &regs;
+ param.syscall_nr = syscall_nr;
+ param.ret = ret_val;
+ return trace_call_bpf_faultable(call, &param);
}
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
- struct pt_regs *fake_regs;
struct hlist_head *head;
bool valid_prog_array;
int syscall_nr;
int rctx;
int size;
- /*
- * Syscall probe called with preemption enabled, but the ring
- * buffer and per-cpu data require preemption to be disabled.
- */
might_fault();
- guard(preempt_notrace)();
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
@@ -1559,29 +1557,37 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
if (!sys_data)
return;
- head = this_cpu_ptr(sys_data->exit_event->perf_events);
+ /*
+ * Run BPF program in faultable context before per-cpu buffer
+ * allocation, allowing sleepable BPF programs to execute.
+ */
valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
- if (!valid_prog_array && hlist_empty(head))
+ if (valid_prog_array &&
+ !perf_call_bpf_exit(sys_data->exit_event, syscall_nr,
+ syscall_get_return_value(current, regs)))
+ return;
+
+ /*
+ * Per-cpu ring buffer and perf event list operations require
+ * preemption to be disabled.
+ */
+ guard(preempt_notrace)();
+
+ head = this_cpu_ptr(sys_data->exit_event->perf_events);
+ if (hlist_empty(head))
return;
/* We can probably do that at build time */
size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
+ rec = perf_trace_buf_alloc(size, NULL, &rctx);
if (!rec)
return;
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
- if ((valid_prog_array &&
- !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
- hlist_empty(head)) {
- perf_swevent_put_recursion_context(rctx);
- return;
- }
-
perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
1, regs, head, NULL);
}
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 2cabf8a23ec5..c274346853d1 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -912,7 +912,7 @@ static int uprobe_buffer_enable(void)
{
int ret = 0;
- BUG_ON(!mutex_is_locked(&event_mutex));
+ lockdep_assert_held(&event_mutex);
if (uprobe_buffer_refcnt++ == 0) {
ret = uprobe_buffer_init();
@@ -927,7 +927,7 @@ static void uprobe_buffer_disable(void)
{
int cpu;
- BUG_ON(!mutex_is_locked(&event_mutex));
+ lockdep_assert_held(&event_mutex);
if (--uprobe_buffer_refcnt == 0) {
for_each_possible_cpu(cpu)
@@ -979,6 +979,7 @@ static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu,
ucb = uprobe_buffer_get();
ucb->dsize = tu->tp.size + dsize;
+ BUILD_BUG_ON(MAX_UCB_BUFFER_SIZE < MAX_PROBE_EVENT_SIZE);
if (WARN_ON_ONCE(ucb->dsize > MAX_UCB_BUFFER_SIZE)) {
ucb->dsize = MAX_UCB_BUFFER_SIZE;
dsize = MAX_UCB_BUFFER_SIZE - tu->tp.size;
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index bf1a507695b6..0dd7927df22a 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -386,13 +386,11 @@ static void tracing_map_elt_init_fields(struct tracing_map_elt *elt)
}
}
-static void tracing_map_elt_free(struct tracing_map_elt *elt)
+static void __tracing_map_elt_free(struct tracing_map_elt *elt)
{
if (!elt)
return;
- if (elt->map->ops && elt->map->ops->elt_free)
- elt->map->ops->elt_free(elt);
kfree(elt->fields);
kfree(elt->vars);
kfree(elt->var_set);
@@ -400,6 +398,17 @@ static void tracing_map_elt_free(struct tracing_map_elt *elt)
kfree(elt);
}
+static void tracing_map_elt_free(struct tracing_map_elt *elt)
+{
+ if (!elt)
+ return;
+
+ /* Only elts initialized by ops->elt_alloc() should be passed to ops->elt_free(). */
+ if (elt->map->ops && elt->map->ops->elt_free)
+ elt->map->ops->elt_free(elt);
+ __tracing_map_elt_free(elt);
+}
+
static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
{
struct tracing_map_elt *elt;
@@ -444,7 +453,7 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
}
return elt;
free:
- tracing_map_elt_free(elt);
+ __tracing_map_elt_free(elt);
return ERR_PTR(err);
}
diff --git a/kernel/umh.c b/kernel/umh.c
index cffda97d961c..48117c569e1a 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -430,7 +430,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
sub_info->wait = wait;
- queue_work(system_unbound_wq, &sub_info->work);
+ queue_work(system_dfl_wq, &sub_info->work);
if (wait == UMH_NO_WAIT) /* task has freed sub_info */
goto unlock;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5f747f241a5f..78068ae8f28a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2281,6 +2281,14 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
unsigned int req_cpu = cpu;
/*
+ * Warn once when work is queued on a workqueue flagged __WQ_DEPRECATED.
+ */
+ if (unlikely(wq->flags & __WQ_DEPRECATED))
+ pr_warn_once("workqueue: work func %ps enqueued on deprecated workqueue. "
+ "Use system_{percpu|dfl}_wq instead.\n",
+ work->func);
+
+ /*
* While a work item is PENDING && off queue, a task trying to
* steal the PENDING will busy-loop waiting for it to either get
* queued or lose PENDING. Grabbing PENDING and queueing should
@@ -2296,6 +2304,18 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
WARN_ONCE(!is_chained_work(wq), "workqueue: cannot queue %ps on wq %s\n",
work->func, wq->name))) {
+ struct work_offq_data offqd;
+
+ /*
+ * State on entry: PENDING is set, work is off-queue (no
+ * insert_work() has run).
+ *
+ * Returning without clearing PENDING would leave the work
+ * in an inconsistent state (PENDING=1, PWQ=0, entry empty).
+ */
+ work_offqd_unpack(&offqd, *work_data_bits(work));
+ set_work_pool_and_clear_pending(work, offqd.pool_id,
+ work_offqd_pack_flags(&offqd));
return;
}
rcu_read_lock();
@@ -5300,16 +5320,6 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
return pwq;
}
-static void apply_wqattrs_lock(void)
-{
- mutex_lock(&wq_pool_mutex);
-}
-
-static void apply_wqattrs_unlock(void)
-{
- mutex_unlock(&wq_pool_mutex);
-}
-
/**
* wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod
* @attrs: the wq_attrs of the default pwq of the target workqueue
@@ -5642,7 +5652,9 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[highpri]);
}
- return ret;
+ if (ret)
+ goto enomem;
+ return 0;
enomem:
if (wq->cpu_pwq) {
@@ -5804,7 +5816,7 @@ static struct workqueue_struct *__alloc_workqueue(const char *fmt,
/* see the comment above the definition of WQ_POWER_EFFICIENT */
if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
- flags |= WQ_UNBOUND;
+ flags = (flags & ~WQ_PERCPU) | WQ_UNBOUND;
/* allocate wq and format name */
if (flags & WQ_UNBOUND)
@@ -5828,6 +5840,23 @@ static struct workqueue_struct *__alloc_workqueue(const char *fmt,
pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n",
wq->name);
+ /*
+ * Exactly one of WQ_PERCPU and WQ_UNBOUND must be set.
+ * - If neither is set, default to WQ_PERCPU
+ * - If both are set, default to WQ_UNBOUND
+ *
+ * This code can be removed once workqueues are unbound by default.
+ */
+ if (unlikely(!(flags & (WQ_UNBOUND | WQ_PERCPU)))) {
+ WARN_ONCE(1, "workqueue: %s is using neither WQ_PERCPU or WQ_UNBOUND. "
+ "Setting WQ_PERCPU.\n", wq->name);
+ flags |= WQ_PERCPU;
+ } else if (unlikely((flags & WQ_PERCPU) && (flags & WQ_UNBOUND))) {
+ WARN_ONCE(1, "workqueue: %s uses both WQ_PERCPU and WQ_UNBOUND. "
+ "Dropped WQ_PERCPU, keeping WQ_UNBOUND.\n", wq->name);
+ flags &= ~WQ_PERCPU;
+ }
+
if (flags & WQ_BH) {
/*
* BH workqueues always share a single execution context per CPU
@@ -5863,7 +5892,7 @@ static struct workqueue_struct *__alloc_workqueue(const char *fmt,
* wq_pool_mutex protects the workqueues list, allocations of PWQs,
* and the global freeze state.
*/
- apply_wqattrs_lock();
+ mutex_lock(&wq_pool_mutex);
if (alloc_and_link_pwqs(wq) < 0)
goto err_unlock_free_node_nr_active;
@@ -5877,7 +5906,7 @@ static struct workqueue_struct *__alloc_workqueue(const char *fmt,
if (wq_online && init_rescuer(wq) < 0)
goto err_unlock_destroy;
- apply_wqattrs_unlock();
+ mutex_unlock(&wq_pool_mutex);
if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
goto err_destroy;
@@ -5885,7 +5914,7 @@ static struct workqueue_struct *__alloc_workqueue(const char *fmt,
return wq;
err_unlock_free_node_nr_active:
- apply_wqattrs_unlock();
+ mutex_unlock(&wq_pool_mutex);
/*
* Failed alloc_and_link_pwqs() may leave pending pwq->release_work,
* flushing the pwq_release_worker ensures that the pwq_release_workfn()
@@ -5900,12 +5929,27 @@ err_free_wq:
kfree(wq);
return NULL;
err_unlock_destroy:
- apply_wqattrs_unlock();
+ mutex_unlock(&wq_pool_mutex);
err_destroy:
destroy_workqueue(wq);
return NULL;
}
+__printf(1, 0)
+static struct workqueue_struct *alloc_workqueue_va(const char *fmt,
+ unsigned int flags,
+ int max_active,
+ va_list args)
+{
+ struct workqueue_struct *wq;
+
+ wq = __alloc_workqueue(fmt, flags, max_active, args);
+ if (wq)
+ wq_init_lockdep(wq);
+
+ return wq;
+}
+
__printf(1, 4)
struct workqueue_struct *alloc_workqueue_noprof(const char *fmt,
unsigned int flags,
@@ -5915,12 +5959,8 @@ struct workqueue_struct *alloc_workqueue_noprof(const char *fmt,
va_list args;
va_start(args, max_active);
- wq = __alloc_workqueue(fmt, flags, max_active, args);
+ wq = alloc_workqueue_va(fmt, flags, max_active, args);
va_end(args);
- if (!wq)
- return NULL;
-
- wq_init_lockdep(wq);
return wq;
}
@@ -5932,15 +5972,15 @@ static void devm_workqueue_release(void *res)
}
__printf(2, 5) struct workqueue_struct *
-devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags,
- int max_active, ...)
+devm_alloc_workqueue_noprof(struct device *dev, const char *fmt,
+ unsigned int flags, int max_active, ...)
{
struct workqueue_struct *wq;
va_list args;
int ret;
va_start(args, max_active);
- wq = alloc_workqueue(fmt, flags, max_active, args);
+ wq = alloc_workqueue_va(fmt, flags, max_active, args);
va_end(args);
if (!wq)
return NULL;
@@ -5951,7 +5991,7 @@ devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags,
return wq;
}
-EXPORT_SYMBOL_GPL(devm_alloc_workqueue);
+EXPORT_SYMBOL_GPL(devm_alloc_workqueue_noprof);
#ifdef CONFIG_LOCKDEP
__printf(1, 5)
@@ -6285,7 +6325,7 @@ EXPORT_SYMBOL_GPL(set_worker_desc);
*/
void print_worker_info(const char *log_lvl, struct task_struct *task)
{
- work_func_t *fn = NULL;
+ work_func_t fn = NULL;
char name[WQ_NAME_LEN] = { };
char desc[WORKER_DESC_LEN] = { };
struct pool_workqueue *pwq = NULL;
@@ -7290,7 +7330,7 @@ static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
struct workqueue_attrs *attrs;
int ret = -ENOMEM;
- apply_wqattrs_lock();
+ mutex_lock(&wq_pool_mutex);
attrs = wq_sysfs_prep_attrs(wq);
if (!attrs)
@@ -7303,7 +7343,7 @@ static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
ret = -EINVAL;
out_unlock:
- apply_wqattrs_unlock();
+ mutex_unlock(&wq_pool_mutex);
free_workqueue_attrs(attrs);
return ret ?: count;
}
@@ -7329,7 +7369,7 @@ static ssize_t wq_cpumask_store(struct device *dev,
struct workqueue_attrs *attrs;
int ret = -ENOMEM;
- apply_wqattrs_lock();
+ mutex_lock(&wq_pool_mutex);
attrs = wq_sysfs_prep_attrs(wq);
if (!attrs)
@@ -7340,7 +7380,7 @@ static ssize_t wq_cpumask_store(struct device *dev,
ret = apply_workqueue_attrs_locked(wq, attrs);
out_unlock:
- apply_wqattrs_unlock();
+ mutex_unlock(&wq_pool_mutex);
free_workqueue_attrs(attrs);
return ret ?: count;
}
@@ -7376,13 +7416,13 @@ static ssize_t wq_affn_scope_store(struct device *dev,
if (affn < 0)
return affn;
- apply_wqattrs_lock();
+ mutex_lock(&wq_pool_mutex);
attrs = wq_sysfs_prep_attrs(wq);
if (attrs) {
attrs->affn_scope = affn;
ret = apply_workqueue_attrs_locked(wq, attrs);
}
- apply_wqattrs_unlock();
+ mutex_unlock(&wq_pool_mutex);
free_workqueue_attrs(attrs);
return ret ?: count;
}
@@ -7407,13 +7447,13 @@ static ssize_t wq_affinity_strict_store(struct device *dev,
if (sscanf(buf, "%d", &v) != 1)
return -EINVAL;
- apply_wqattrs_lock();
+ mutex_lock(&wq_pool_mutex);
attrs = wq_sysfs_prep_attrs(wq);
if (attrs) {
attrs->affn_strict = (bool)v;
ret = apply_workqueue_attrs_locked(wq, attrs);
}
- apply_wqattrs_unlock();
+ mutex_unlock(&wq_pool_mutex);
free_workqueue_attrs(attrs);
return ret ?: count;
}
@@ -7454,12 +7494,12 @@ static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
cpumask_and(cpumask, cpumask, cpu_possible_mask);
if (!cpumask_empty(cpumask)) {
ret = 0;
- apply_wqattrs_lock();
+ mutex_lock(&wq_pool_mutex);
if (!cpumask_equal(cpumask, wq_unbound_cpumask))
ret = workqueue_apply_unbound_cpumask(cpumask);
if (!ret)
cpumask_copy(wq_requested_unbound_cpumask, cpumask);
- apply_wqattrs_unlock();
+ mutex_unlock(&wq_pool_mutex);
}
return ret;
@@ -8012,12 +8052,12 @@ void __init workqueue_init_early(void)
ordered_wq_attrs[i] = attrs;
}
- system_wq = alloc_workqueue("events", WQ_PERCPU, 0);
+ system_wq = alloc_workqueue("events", WQ_PERCPU | __WQ_DEPRECATED, 0);
system_percpu_wq = alloc_workqueue("events", WQ_PERCPU, 0);
system_highpri_wq = alloc_workqueue("events_highpri",
WQ_HIGHPRI | WQ_PERCPU, 0);
system_long_wq = alloc_workqueue("events_long", WQ_PERCPU, 0);
- system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
+ system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND | __WQ_DEPRECATED, WQ_MAX_ACTIVE);
system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
system_freezable_wq = alloc_workqueue("events_freezable",
WQ_FREEZABLE | WQ_PERCPU, 0);
@@ -8187,11 +8227,7 @@ static bool __init cpus_dont_share(int cpu0, int cpu1)
static bool __init cpus_share_smt(int cpu0, int cpu1)
{
-#ifdef CONFIG_SCHED_SMT
return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1));
-#else
- return false;
-#endif
}
static bool __init cpus_share_numa(int cpu0, int cpu1)