author     Jakub Kicinski <kuba@kernel.org>   2023-04-20 16:27:33 -0700
committer  Jakub Kicinski <kuba@kernel.org>   2023-04-20 16:29:51 -0700
commit     681c5b51dc6b8ff1ec05555243eccf64a08cb2fd (patch)
tree       713e0523bc617cbda9ff930568dd2341907b87b8 /kernel
parent     e315e7b83a22043bffee450437d7089ef373cbf6 (diff)
parent     0f2a4af27b649c13ba76431552fe49c60120d0f6 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Adjacent changes:
net/mptcp/protocol.h
63740448a32e ("mptcp: fix accept vs worker race")
2a6a870e44dd ("mptcp: stops worker on unaccepted sockets at listener close")
ddb1a072f858 ("mptcp: move first subflow allocation at mpc access time")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--   kernel/bpf/verifier.c            |  15
-rw-r--r--   kernel/cgroup/cpuset.c           | 178
-rw-r--r--   kernel/cgroup/legacy_freezer.c   |   7
-rw-r--r--   kernel/cgroup/rstat.c            |   4
-rw-r--r--   kernel/fork.c                    |   1
-rw-r--r--   kernel/sched/fair.c              |  10
-rw-r--r--   kernel/sys.c                     |  69

7 files changed, 216 insertions(+), 68 deletions(-)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d6db6de3e9ea..6e12cc296ad8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3243,6 +3243,21 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
 			}
 		} else if (opcode == BPF_EXIT) {
 			return -ENOTSUPP;
+		} else if (BPF_SRC(insn->code) == BPF_X) {
+			if (!(*reg_mask & (dreg | sreg)))
+				return 0;
+			/* dreg <cond> sreg
+			 * Both dreg and sreg need precision before
+			 * this insn. If only sreg was marked precise
+			 * before it would be equally necessary to
+			 * propagate it to dreg.
+			 */
+			*reg_mask |= (sreg | dreg);
+			/* else dreg <cond> K
+			 * Only dreg still needs precision before
+			 * this insn, so for the K-based conditional
+			 * there is nothing new to be marked.
+			 */
 		}
 	} else if (class == BPF_LD) {
 		if (!(*reg_mask & dreg))
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 636f1c682ac0..505d86b16642 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1513,7 +1513,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
 	spin_unlock_irq(&callback_lock);
 
 	if (adding || deleting)
-		update_tasks_cpumask(parent, tmp->new_cpus);
+		update_tasks_cpumask(parent, tmp->addmask);
 
 	/*
 	 * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
@@ -1770,10 +1770,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	/*
 	 * Use the cpumasks in trialcs for tmpmasks when they are pointers
 	 * to allocated cpumasks.
+	 *
+	 * Note that update_parent_subparts_cpumask() uses only addmask &
+	 * delmask, but not new_cpus.
 	 */
 	tmp.addmask  = trialcs->subparts_cpus;
 	tmp.delmask  = trialcs->effective_cpus;
-	tmp.new_cpus = trialcs->cpus_allowed;
+	tmp.new_cpus = NULL;
 #endif
 
 	retval = validate_change(cs, trialcs);
@@ -1838,6 +1841,11 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	}
 	spin_unlock_irq(&callback_lock);
 
+#ifdef CONFIG_CPUMASK_OFFSTACK
+	/* Now trialcs->cpus_allowed is available */
+	tmp.new_cpus = trialcs->cpus_allowed;
+#endif
+
 	/* effective_cpus will be updated here */
 	update_cpumasks_hier(cs, &tmp, false);
 
@@ -2445,6 +2453,20 @@ static int fmeter_getrate(struct fmeter *fmp)
 
 static struct cpuset *cpuset_attach_old_cs;
 
+/*
+ * Check to see if a cpuset can accept a new task
+ * For v1, cpus_allowed and mems_allowed can't be empty.
+ * For v2, effective_cpus can't be empty.
+ * Note that in v1, effective_cpus = cpus_allowed.
+ */
+static int cpuset_can_attach_check(struct cpuset *cs)
+{
+	if (cpumask_empty(cs->effective_cpus) ||
+	    (!is_in_v2_mode() && nodes_empty(cs->mems_allowed)))
+		return -ENOSPC;
+	return 0;
+}
+
 /* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
 static int cpuset_can_attach(struct cgroup_taskset *tset)
 {
@@ -2459,16 +2481,9 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 
 	percpu_down_write(&cpuset_rwsem);
 
-	/* allow moving tasks into an empty cpuset if on default hierarchy */
-	ret = -ENOSPC;
-	if (!is_in_v2_mode() &&
-	    (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
-		goto out_unlock;
-
-	/*
-	 * Task cannot be moved to a cpuset with empty effective cpus.
-	 */
-	if (cpumask_empty(cs->effective_cpus))
+	/* Check to see if task is allowed in the cpuset */
+	ret = cpuset_can_attach_check(cs);
+	if (ret)
 		goto out_unlock;
 
 	cgroup_taskset_for_each(task, css, tset) {
@@ -2485,7 +2500,6 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 	 * changes which zero cpus/mems_allowed.
 	 */
 	cs->attach_in_progress++;
-	ret = 0;
 out_unlock:
 	percpu_up_write(&cpuset_rwsem);
 	return ret;
@@ -2494,25 +2508,47 @@ out_unlock:
 static void cpuset_cancel_attach(struct cgroup_taskset *tset)
 {
 	struct cgroup_subsys_state *css;
+	struct cpuset *cs;
 
 	cgroup_taskset_first(tset, &css);
+	cs = css_cs(css);
 
 	percpu_down_write(&cpuset_rwsem);
-	css_cs(css)->attach_in_progress--;
+	cs->attach_in_progress--;
+	if (!cs->attach_in_progress)
+		wake_up(&cpuset_attach_wq);
 	percpu_up_write(&cpuset_rwsem);
 }
 
 /*
- * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach()
+ * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach_task()
  * but we can't allocate it dynamically there.  Define it global and
  * allocate from cpuset_init().
  */
 static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_to;
+
+static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
+{
+	percpu_rwsem_assert_held(&cpuset_rwsem);
+
+	if (cs != &top_cpuset)
+		guarantee_online_cpus(task, cpus_attach);
+	else
+		cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
+			       cs->subparts_cpus);
+	/*
+	 * can_attach beforehand should guarantee that this doesn't
+	 * fail.  TODO: have a better way to handle failure here
+	 */
+	WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+
+	cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
+	cpuset_update_task_spread_flags(cs, task);
+}
 
 static void cpuset_attach(struct cgroup_taskset *tset)
 {
-	/* static buf protected by cpuset_rwsem */
-	static nodemask_t cpuset_attach_nodemask_to;
 	struct task_struct *task;
 	struct task_struct *leader;
 	struct cgroup_subsys_state *css;
@@ -2543,20 +2579,8 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 
 	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
 
-	cgroup_taskset_for_each(task, css, tset) {
-		if (cs != &top_cpuset)
-			guarantee_online_cpus(task, cpus_attach);
-		else
-			cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
-		/*
-		 * can_attach beforehand should guarantee that this doesn't
-		 * fail.  TODO: have a better way to handle failure here
-		 */
-		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
-
-		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
-		cpuset_update_task_spread_flags(cs, task);
-	}
+	cgroup_taskset_for_each(task, css, tset)
+		cpuset_attach_task(cs, task);
 
 	/*
 	 * Change mm for all threadgroup leaders. This is expensive and may
@@ -3248,17 +3272,101 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
 }
 
 /*
+ * In case the child is cloned into a cpuset different from its parent,
+ * additional checks are done to see if the move is allowed.
+ */
+static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
+{
+	struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
+	bool same_cs;
+	int ret;
+
+	rcu_read_lock();
+	same_cs = (cs == task_cs(current));
+	rcu_read_unlock();
+
+	if (same_cs)
+		return 0;
+
+	lockdep_assert_held(&cgroup_mutex);
+	percpu_down_write(&cpuset_rwsem);
+
+	/* Check to see if task is allowed in the cpuset */
+	ret = cpuset_can_attach_check(cs);
+	if (ret)
+		goto out_unlock;
+
+	ret = task_can_attach(task, cs->effective_cpus);
+	if (ret)
+		goto out_unlock;
+
+	ret = security_task_setscheduler(task);
+	if (ret)
+		goto out_unlock;
+
+	/*
+	 * Mark attach is in progress.  This makes validate_change() fail
+	 * changes which zero cpus/mems_allowed.
+	 */
+	cs->attach_in_progress++;
+out_unlock:
+	percpu_up_write(&cpuset_rwsem);
+	return ret;
+}
+
+static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
+{
+	struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
+	bool same_cs;
+
+	rcu_read_lock();
+	same_cs = (cs == task_cs(current));
+	rcu_read_unlock();
+
+	if (same_cs)
+		return;
+
+	percpu_down_write(&cpuset_rwsem);
+	cs->attach_in_progress--;
+	if (!cs->attach_in_progress)
+		wake_up(&cpuset_attach_wq);
+	percpu_up_write(&cpuset_rwsem);
+}
+
+/*
  * Make sure the new task conform to the current state of its parent,
  * which could have been changed by cpuset just after it inherits the
  * state from the parent and before it sits on the cgroup's task list.
  */
 static void cpuset_fork(struct task_struct *task)
 {
-	if (task_css_is_root(task, cpuset_cgrp_id))
+	struct cpuset *cs;
+	bool same_cs;
+
+	rcu_read_lock();
+	cs = task_cs(task);
+	same_cs = (cs == task_cs(current));
+	rcu_read_unlock();
+
+	if (same_cs) {
+		if (cs == &top_cpuset)
+			return;
+
+		set_cpus_allowed_ptr(task, current->cpus_ptr);
+		task->mems_allowed = current->mems_allowed;
 		return;
+	}
+
+	/* CLONE_INTO_CGROUP */
+	percpu_down_write(&cpuset_rwsem);
+	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+	cpuset_attach_task(cs, task);
+
+	cs->attach_in_progress--;
+	if (!cs->attach_in_progress)
+		wake_up(&cpuset_attach_wq);
 
-	set_cpus_allowed_ptr(task, current->cpus_ptr);
-	task->mems_allowed = current->mems_allowed;
+	percpu_up_write(&cpuset_rwsem);
 }
 
 struct cgroup_subsys cpuset_cgrp_subsys = {
@@ -3271,6 +3379,8 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
 	.attach		= cpuset_attach,
 	.post_attach	= cpuset_post_attach,
 	.bind		= cpuset_bind,
+	.can_fork	= cpuset_can_fork,
+	.cancel_fork	= cpuset_cancel_fork,
 	.fork		= cpuset_fork,
 	.legacy_cftypes	= legacy_files,
 	.dfl_cftypes	= dfl_files,
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 1b6b21851e9d..936473203a6b 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -22,6 +22,7 @@
 #include <linux/freezer.h>
 #include <linux/seq_file.h>
 #include <linux/mutex.h>
+#include <linux/cpu.h>
 
 /*
  * A cgroup is freezing if any FREEZING flags are set.  FREEZING_SELF is
@@ -350,7 +351,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
 
 	if (freeze) {
 		if (!(freezer->state & CGROUP_FREEZING))
-			static_branch_inc(&freezer_active);
+			static_branch_inc_cpuslocked(&freezer_active);
 		freezer->state |= state;
 		freeze_cgroup(freezer);
 	} else {
@@ -361,7 +362,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
 		if (!(freezer->state & CGROUP_FREEZING)) {
 			freezer->state &= ~CGROUP_FROZEN;
 			if (was_freezing)
-				static_branch_dec(&freezer_active);
+				static_branch_dec_cpuslocked(&freezer_active);
 			unfreeze_cgroup(freezer);
 		}
 	}
@@ -379,6 +380,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
 {
 	struct cgroup_subsys_state *pos;
 
+	cpus_read_lock();
 	/*
 	 * Update all its descendants in pre-order traversal.  Each
 	 * descendant will try to inherit its parent's FREEZING state as
@@ -407,6 +409,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
 	}
 	rcu_read_unlock();
 	mutex_unlock(&freezer_mutex);
+	cpus_read_unlock();
 }
 
 static ssize_t freezer_write(struct kernfs_open_file *of,
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 831f1f472bb8..0a2b4967e333 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -457,9 +457,7 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
 	struct task_cputime *cputime = &bstat->cputime;
 	int i;
 
-	cputime->stime = 0;
-	cputime->utime = 0;
-	cputime->sum_exec_runtime = 0;
+	memset(bstat, 0, sizeof(*bstat));
 	for_each_possible_cpu(i) {
 		struct kernel_cpustat kcpustat;
 		u64 *cpustat = kcpustat.cpustat;
diff --git a/kernel/fork.c b/kernel/fork.c
index 0c92f224c68c..ea332319dffe 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1174,6 +1174,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 fail_pcpu:
 	while (i > 0)
 		percpu_counter_destroy(&mm->rss_stat[--i]);
+	destroy_context(mm);
 fail_nocontext:
 	mm_free_pgd(mm);
 fail_nopgd:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6986ea31c984..5f6587d94c1d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10238,6 +10238,16 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 
 		sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
 				sds->total_capacity;
+
+		/*
+		 * If the local group is more loaded than the average system
+		 * load, don't try to pull any tasks.
+		 */
+		if (local->avg_load >= sds->avg_load) {
+			env->imbalance = 0;
+			return;
+		}
+
 	}
 
 	/*
diff --git a/kernel/sys.c b/kernel/sys.c
index 495cd87d9bf4..351de7916302 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -664,6 +664,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 	struct cred *new;
 	int retval;
 	kuid_t kruid, keuid, ksuid;
+	bool ruid_new, euid_new, suid_new;
 
 	kruid = make_kuid(ns, ruid);
 	keuid = make_kuid(ns, euid);
@@ -678,25 +679,29 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 	if ((suid != (uid_t) -1) && !uid_valid(ksuid))
 		return -EINVAL;
 
+	old = current_cred();
+
+	/* check for no-op */
+	if ((ruid == (uid_t) -1 || uid_eq(kruid, old->uid)) &&
+	    (euid == (uid_t) -1 || (uid_eq(keuid, old->euid) &&
+				    uid_eq(keuid, old->fsuid))) &&
+	    (suid == (uid_t) -1 || uid_eq(ksuid, old->suid)))
+		return 0;
+
+	ruid_new = ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
+		   !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid);
+	euid_new = euid != (uid_t) -1 && !uid_eq(keuid, old->uid) &&
+		   !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid);
+	suid_new = suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) &&
+		   !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid);
+	if ((ruid_new || euid_new || suid_new) &&
+	    !ns_capable_setid(old->user_ns, CAP_SETUID))
+		return -EPERM;
+
 	new = prepare_creds();
 	if (!new)
 		return -ENOMEM;
-	old = current_cred();
-
-	retval = -EPERM;
-	if (!ns_capable_setid(old->user_ns, CAP_SETUID)) {
-		if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
-		    !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
-			goto error;
-		if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) &&
-		    !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid))
-			goto error;
-		if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) &&
-		    !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid))
-			goto error;
-	}
-
 	if (ruid != (uid_t) -1) {
 		new->uid = kruid;
 		if (!uid_eq(kruid, old->uid)) {
@@ -761,6 +766,7 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
 	struct cred *new;
 	int retval;
 	kgid_t krgid, kegid, ksgid;
+	bool rgid_new, egid_new, sgid_new;
 
 	krgid = make_kgid(ns, rgid);
 	kegid = make_kgid(ns, egid);
@@ -773,23 +779,28 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
 	if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
 		return -EINVAL;
 
+	old = current_cred();
+
+	/* check for no-op */
+	if ((rgid == (gid_t) -1 || gid_eq(krgid, old->gid)) &&
+	    (egid == (gid_t) -1 || (gid_eq(kegid, old->egid) &&
+				    gid_eq(kegid, old->fsgid))) &&
+	    (sgid == (gid_t) -1 || gid_eq(ksgid, old->sgid)))
+		return 0;
+
+	rgid_new = rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
+		   !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid);
+	egid_new = egid != (gid_t) -1 && !gid_eq(kegid, old->gid) &&
+		   !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid);
+	sgid_new = sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) &&
+		   !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid);
+	if ((rgid_new || egid_new || sgid_new) &&
+	    !ns_capable_setid(old->user_ns, CAP_SETGID))
+		return -EPERM;
+
 	new = prepare_creds();
 	if (!new)
 		return -ENOMEM;
-	old = current_cred();
-
-	retval = -EPERM;
-	if (!ns_capable_setid(old->user_ns, CAP_SETGID)) {
-		if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
-		    !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
-			goto error;
-		if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) &&
-		    !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid))
-			goto error;
-		if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) &&
-		    !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid))
-			goto error;
-	}
 
 	if (rgid != (gid_t) -1)
 		new->gid = krgid;
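The __sys_setresuid()/__sys_setresgid() hunks above move the CAP_SETUID/CAP_SETGID check ahead of prepare_creds() and add an early return 0 when the requested IDs already match the current credentials. The small userspace sketch below is illustrative only and not part of the patch; it exercises that no-op path from an unprivileged process, assuming a kernel with this change applied.

/* Illustrative test, not from the patch: exercise the no-op fast path
 * added to setresuid() by the kernel/sys.c change above.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uid_t ruid, euid, suid;

	if (getresuid(&ruid, &euid, &suid) != 0) {
		perror("getresuid");
		return 1;
	}

	/* All three arguments are -1: with this change the kernel returns 0
	 * before prepare_creds() and before the CAP_SETUID check runs. */
	if (setresuid((uid_t)-1, (uid_t)-1, (uid_t)-1) != 0)
		perror("setresuid(-1, -1, -1)");

	/* Passing the current values also takes the no-op path, provided the
	 * effective UID still matches the filesystem UID (the common case). */
	if (setresuid(ruid, euid, suid) != 0)
		perror("setresuid(current values)");

	printf("uid=%u euid=%u suid=%u\n", ruid, euid, suid);
	return 0;
}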