Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c  318
1 file changed, 182 insertions(+), 136 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3e5a6bf587f9..cfaca3040b2f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -91,7 +91,6 @@
#include "autogroup.h"
#include "pelt.h"
#include "smp.h"
-#include "stats.h"
#include "../workqueue_internal.h"
#include "../../io_uring/io-wq.h"
@@ -119,7 +118,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-#ifdef CONFIG_SCHED_DEBUG
/*
* Debugging: various feature bits
*
@@ -129,7 +127,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
*/
#define SCHED_FEAT(name, enabled) \
(1UL << __SCHED_FEAT_##name) * enabled |
-const_debug unsigned int sysctl_sched_features =
+__read_mostly unsigned int sysctl_sched_features =
#include "features.h"
0;
#undef SCHED_FEAT
@@ -143,13 +141,12 @@ const_debug unsigned int sysctl_sched_features =
*/
__read_mostly int sysctl_resched_latency_warn_ms = 100;
__read_mostly int sysctl_resched_latency_warn_once = 1;
-#endif /* CONFIG_SCHED_DEBUG */
/*
* Number of tasks to iterate in a single balance run.
* Limited because this is done with IRQs disabled.
*/
-const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
+__read_mostly unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
__read_mostly int scheduler_running;
@@ -491,6 +488,16 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
#endif /* CONFIG_SCHED_CORE */
+/* need a wrapper since we may need to trace from modules */
+EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp);
+
+/* Call via the helper macro trace_set_current_state. */
+void __trace_set_current_state(int state_value)
+{
+ trace_sched_set_state_tp(current, state_value);
+}
+EXPORT_SYMBOL(__trace_set_current_state);
+
/*
* Serialization rules:
*
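/*
 * Illustration, not part of the patch: with sched_set_state_tp exported,
 * a module can attach a probe via the standard register_trace_<name>()
 * API generated by DECLARE_TRACE(). A minimal sketch; names prefixed
 * "demo_" are hypothetical:
 *
 *	static void demo_probe(void *data, struct task_struct *tsk, int state)
 *	{
 *		pr_debug("set_state: comm=%s state=%d\n", tsk->comm, state);
 *	}
 *
 *	static int __init demo_init(void)
 *	{
 *		return register_trace_sched_set_state_tp(demo_probe, NULL);
 *	}
 *
 *	static void __exit demo_exit(void)
 *	{
 *		unregister_trace_sched_set_state_tp(demo_probe, NULL);
 *		tracepoint_synchronize_unregister();
 *	}
 */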
@@ -740,39 +747,43 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
s64 __maybe_unused steal = 0, irq_delta = 0;
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+ if (irqtime_enabled()) {
+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
- /*
- * Since irq_time is only updated on {soft,}irq_exit, we might run into
- * this case when a previous update_rq_clock() happened inside a
- * {soft,}IRQ region.
- *
- * When this happens, we stop ->clock_task and only update the
- * prev_irq_time stamp to account for the part that fit, so that a next
- * update will consume the rest. This ensures ->clock_task is
- * monotonic.
- *
- * It does however cause some slight miss-attribution of {soft,}IRQ
- * time, a more accurate solution would be to update the irq_time using
- * the current rq->clock timestamp, except that would require using
- * atomic ops.
- */
- if (irq_delta > delta)
- irq_delta = delta;
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}IRQ region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight miss-attribution of {soft,}IRQ
+ * time, a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
- rq->prev_irq_time += irq_delta;
- delta -= irq_delta;
- delayacct_irq(rq->curr, irq_delta);
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+ delayacct_irq(rq->curr, irq_delta);
+ }
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((&paravirt_steal_rq_enabled))) {
- steal = paravirt_steal_clock(cpu_of(rq));
+ u64 prev_steal;
+
+ steal = prev_steal = paravirt_steal_clock(cpu_of(rq));
steal -= rq->prev_steal_time_rq;
if (unlikely(steal > delta))
steal = delta;
- rq->prev_steal_time_rq += steal;
+ rq->prev_steal_time_rq = prev_steal;
delta -= steal;
}
#endif
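/*
 * Illustration, not part of the patch: worked example of the steal-time
 * change. Suppose delta = 3ms and the paravirt steal clock advanced by
 * 10ms since the last update:
 *
 *	old: steal clamped to 3ms, prev_steal_time_rq += 3ms
 *	     -> the excess 7ms stays "owed" and keeps consuming delta in
 *	        subsequent updates, stalling ->clock_task until caught up.
 *	new: prev_steal_time_rq = prev_steal (the raw stamp)
 *	     -> the excess is dropped and ->clock_task advances again on
 *	        the next update.
 *
 * The irqtime_enabled() check likewise makes the IRQ-time block a no-op
 * when irqtime accounting is off, instead of reading timestamps that
 * are never updated.
 */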
@@ -789,22 +800,25 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
void update_rq_clock(struct rq *rq)
{
s64 delta;
+ u64 clock;
lockdep_assert_rq_held(rq);
if (rq->clock_update_flags & RQCF_ACT_SKIP)
return;
-#ifdef CONFIG_SCHED_DEBUG
if (sched_feat(WARN_DOUBLE_CLOCK))
- SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
+ WARN_ON_ONCE(rq->clock_update_flags & RQCF_UPDATED);
rq->clock_update_flags |= RQCF_UPDATED;
-#endif
- delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+ clock = sched_clock_cpu(cpu_of(rq));
+ scx_rq_clock_update(rq, clock);
+
+ delta = clock - rq->clock;
if (delta < 0)
return;
rq->clock += delta;
+
update_rq_clock_task(rq, delta);
}
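/*
 * Illustration, not part of the patch: update_rq_clock() requires the
 * runqueue lock (see lockdep_assert_rq_held() above), and the added
 * scx_rq_clock_update() hands the same timestamp to sched_ext so BPF
 * schedulers observe a clock consistent with rq->clock. Typical caller
 * shape inside the scheduler:
 *
 *	struct rq_flags rf;
 *
 *	rq_lock(rq, &rf);
 *	update_rq_clock(rq);
 *	...			// read rq_clock(rq) / rq_clock_task(rq)
 *	rq_unlock(rq, &rf);
 */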
@@ -908,8 +922,7 @@ static void hrtick_rq_init(struct rq *rq)
#ifdef CONFIG_SMP
INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
#endif
- hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
- rq->hrtick_timer.function = hrtick;
+ hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
}
#else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
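/*
 * Illustration, not part of the patch: hrtimer_setup() collapses the
 * old two-step pattern into one call, so the callback is known to the
 * hrtimer core before the timer can ever be armed:
 *
 *	old:	hrtimer_init(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
 *		t.function = callback;
 *	new:	hrtimer_setup(&t, callback, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
 */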
@@ -1055,9 +1068,10 @@ void wake_up_q(struct wake_q_head *head)
struct task_struct *task;
task = container_of(node, struct task_struct, wake_q);
- /* Task can safely be re-inserted now: */
node = node->next;
- task->wake_q.next = NULL;
+ /* pairs with cmpxchg_relaxed() in __wake_q_add() */
+ WRITE_ONCE(task->wake_q.next, NULL);
+ /* Task can safely be re-inserted now. */
/*
* wake_up_process() executes a full barrier, which pairs with
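/*
 * Illustration, not part of the patch: the WRITE_ONCE() pairs with the
 * enqueue side in __wake_q_add(), which claims the node with an atomic
 * cmpxchg roughly like:
 *
 *	smp_mb__before_atomic();
 *	if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
 *		return false;	// already queued elsewhere
 *
 * Marking the reset a WRITE_ONCE() makes the race on ->wake_q.next
 * explicit to the compiler and to KCSAN.
 */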
@@ -1168,13 +1182,13 @@ int get_nohz_timer_target(void)
struct sched_domain *sd;
const struct cpumask *hk_mask;
- if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) {
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) {
if (!idle_cpu(cpu))
return cpu;
default_cpu = cpu;
}
- hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
+ hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
guard(rcu)();
@@ -1189,7 +1203,7 @@ int get_nohz_timer_target(void)
}
if (default_cpu == -1)
- default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
+ default_cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE);
return default_cpu;
}
@@ -1341,7 +1355,7 @@ bool sched_can_stop_tick(struct rq *rq)
if (scx_enabled() && !scx_can_stop_tick(rq))
return false;
- if (rq->cfs.h_nr_running > 1)
+ if (rq->cfs.h_nr_queued > 1)
return false;
/*
@@ -1711,7 +1725,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
bucket = &uc_rq->bucket[uc_se->bucket_id];
- SCHED_WARN_ON(!bucket->tasks);
+ WARN_ON_ONCE(!bucket->tasks);
if (likely(bucket->tasks))
bucket->tasks--;
@@ -1731,7 +1745,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
* Defensive programming: this should never happen. If it happens,
* e.g. due to future modification, warn and fix up the expected value.
*/
- SCHED_WARN_ON(bucket->value > rq_clamp);
+ WARN_ON_ONCE(bucket->value > rq_clamp);
if (bucket->value >= rq_clamp) {
bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
uclamp_rq_set(rq, clamp_id, bkt_clamp);
@@ -1748,7 +1762,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
* The condition is constructed such that a NOP is generated when
* sched_uclamp_used is disabled.
*/
- if (!static_branch_unlikely(&sched_uclamp_used))
+ if (!uclamp_is_used())
return;
if (unlikely(!p->sched_class->uclamp_enabled))
@@ -1775,7 +1789,7 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
* The condition is constructed such that a NOP is generated when
* sched_uclamp_used is disabled.
*/
- if (!static_branch_unlikely(&sched_uclamp_used))
+ if (!uclamp_is_used())
return;
if (unlikely(!p->sched_class->uclamp_enabled))
@@ -1933,12 +1947,12 @@ static int sysctl_sched_uclamp_handler(const struct ctl_table *table, int write,
}
if (update_root_tg) {
- static_branch_enable(&sched_uclamp_used);
+ sched_uclamp_enable();
uclamp_update_root_tg();
}
if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
- static_branch_enable(&sched_uclamp_used);
+ sched_uclamp_enable();
uclamp_sync_util_min_rt_default();
}
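/*
 * Illustration, not part of the patch: the helpers wrap the uclamp
 * static key; an assumed shape (the real definitions live in the
 * scheduler headers, not in this hunk):
 *
 *	static inline bool uclamp_is_used(void)
 *	{
 *		return static_branch_likely(&sched_uclamp_used);
 *	}
 *
 *	static inline void sched_uclamp_enable(void)
 *	{
 *		static_branch_enable(&sched_uclamp_used);
 *	}
 */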
@@ -2113,7 +2127,7 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
- SCHED_WARN_ON(flags & DEQUEUE_SLEEP);
+ WARN_ON_ONCE(flags & DEQUEUE_SLEEP);
WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
ASSERT_EXCLUSIVE_WRITER(p->on_rq);
@@ -2718,7 +2732,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
* XXX do further audits, this smells like something putrid.
*/
if (ctx->flags & SCA_MIGRATE_DISABLE)
- SCHED_WARN_ON(!p->on_cpu);
+ WARN_ON_ONCE(!p->on_cpu);
else
lockdep_assert_held(&p->pi_lock);
@@ -3283,7 +3297,6 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
-#ifdef CONFIG_SCHED_DEBUG
unsigned int state = READ_ONCE(p->__state);
/*
@@ -3321,7 +3334,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
WARN_ON_ONCE(!cpu_online(new_cpu));
WARN_ON_ONCE(is_migration_disabled(p));
-#endif
trace_sched_migrate_task(p, new_cpu);
@@ -3534,7 +3546,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
*
* More yuck to audit.
*/
- do_set_cpus_allowed(p, task_cpu_possible_mask(p));
+ do_set_cpus_allowed(p, task_cpu_fallback_mask(p));
state = fail;
break;
case fail:
@@ -3913,13 +3925,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
- /*
- * The BPF scheduler may depend on select_task_rq() being invoked during
- * wakeups. In addition, @p may end up executing on a different CPU
- * regardless of what happens in the wakeup path making the ttwu_queue
- * optimization less meaningful. Skip if on SCX.
- */
- if (task_on_scx(p))
+ /* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */
+ if (!scx_allow_ttwu_queue(p))
return false;
/*
@@ -4187,7 +4194,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* - we're serialized against set_special_state() by virtue of
* it disabling IRQs (this allows not taking ->pi_lock).
*/
- SCHED_WARN_ON(p->se.sched_delayed);
+ WARN_ON_ONCE(p->se.sched_delayed);
if (!ttwu_state_match(p, state, &success))
goto out;
@@ -4481,7 +4488,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
INIT_LIST_HEAD(&p->se.group_node);
/* A delayed task cannot be in clone(). */
- SCHED_WARN_ON(p->se.sched_delayed);
+ WARN_ON_ONCE(p->se.sched_delayed);
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
@@ -4646,7 +4653,7 @@ static int sysctl_schedstats(const struct ctl_table *table, int write, void *buf
#endif /* CONFIG_SCHEDSTATS */
#ifdef CONFIG_SYSCTL
-static struct ctl_table sched_core_sysctls[] = {
+static const struct ctl_table sched_core_sysctls[] = {
#ifdef CONFIG_SCHEDSTATS
{
.procname = "sched_schedstats",
@@ -5298,6 +5305,12 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
*/
finish_task_switch(prev);
+ /*
+ * This is a special case: the newly created task has just
+ * switched the context for the first time. It is returning from
+ * schedule for the first time in this path.
+ */
+ trace_sched_exit_tp(true, CALLER_ADDR0);
preempt_enable();
if (current->set_child_tid)
@@ -5569,7 +5582,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
-#ifdef CONFIG_SCHED_DEBUG
static u64 cpu_resched_latency(struct rq *rq)
{
int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
@@ -5614,9 +5626,6 @@ static int __init setup_resched_latency_warn_ms(char *str)
return 1;
}
__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
-#else
-static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
-#endif /* CONFIG_SCHED_DEBUG */
/*
* This function gets called by the timer code, with HZ frequency.
@@ -5632,7 +5641,7 @@ void sched_tick(void)
unsigned long hw_pressure;
u64 resched_latency;
- if (housekeeping_cpu(cpu, HK_TYPE_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
arch_scale_freq_tick();
sched_clock_tick();
@@ -5737,7 +5746,7 @@ static void sched_tick_remote(struct work_struct *work)
* we are always sure that there is no proxy (only a
* single task is running).
*/
- SCHED_WARN_ON(rq->curr != rq->donor);
+ WARN_ON_ONCE(rq->curr != rq->donor);
update_rq_clock(rq);
if (!is_idle_task(curr)) {
@@ -5771,7 +5780,7 @@ static void sched_tick_start(int cpu)
int os;
struct tick_work *twork;
- if (housekeeping_cpu(cpu, HK_TYPE_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
return;
WARN_ON_ONCE(!tick_work_cpu);
@@ -5792,7 +5801,7 @@ static void sched_tick_stop(int cpu)
struct tick_work *twork;
int os;
- if (housekeeping_cpu(cpu, HK_TYPE_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
return;
WARN_ON_ONCE(!tick_work_cpu);
@@ -5957,7 +5966,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
preempt_count_set(PREEMPT_DISABLED);
}
rcu_sleep_check();
- SCHED_WARN_ON(ct_state() == CT_STATE_USER);
+ WARN_ON_ONCE(ct_state() == CT_STATE_USER);
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -6018,7 +6027,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* opportunity to pull in more work from other CPUs.
*/
if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
- rq->nr_running == rq->cfs.h_nr_running)) {
+ rq->nr_running == rq->cfs.h_nr_queued)) {
p = pick_next_task_fair(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
@@ -6641,13 +6650,15 @@ static void __sched notrace __schedule(int sched_mode)
* as a preemption by schedule_debug() and RCU.
*/
bool preempt = sched_mode > SM_NONE;
- bool block = false;
+ bool is_switch = false;
unsigned long *switch_count;
unsigned long prev_state;
struct rq_flags rf;
struct rq *rq;
int cpu;
+ trace_sched_entry_tp(preempt, CALLER_ADDR0);
+
cpu = smp_processor_id();
rq = cpu_rq(cpu);
prev = rq->curr;
@@ -6702,7 +6713,7 @@ static void __sched notrace __schedule(int sched_mode)
goto picked;
}
} else if (!preempt && prev_state) {
- block = try_to_block_task(rq, prev, prev_state);
+ try_to_block_task(rq, prev, prev_state);
switch_count = &prev->nvcsw;
}
@@ -6711,11 +6722,10 @@ static void __sched notrace __schedule(int sched_mode)
picked:
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
-#ifdef CONFIG_SCHED_DEBUG
rq->last_seen_need_resched_ns = 0;
-#endif
- if (likely(prev != next)) {
+ is_switch = prev != next;
+ if (likely(is_switch)) {
rq->nr_switches++;
/*
* RCU users of rcu_dereference(rq->curr) may not see
@@ -6748,7 +6758,8 @@ picked:
migrate_disable_switch(rq, prev);
psi_account_irqtime(rq, prev, next);
- psi_sched_switch(prev, next, block);
+ psi_sched_switch(prev, next, !task_on_rq_queued(prev) ||
+ prev->se.sched_delayed);
trace_sched_switch(preempt, prev, next, prev_state);
@@ -6759,6 +6770,7 @@ picked:
__balance_callbacks(rq);
raw_spin_rq_unlock_irq(rq);
}
+ trace_sched_exit_tp(is_switch, CALLER_ADDR0);
}
void __noreturn do_task_dead(void)
@@ -6803,7 +6815,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
* deadlock if the callback attempts to acquire a lock which is
* already acquired.
*/
- SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);
+ WARN_ON_ONCE(current->__state & TASK_RTLOCK_WAIT);
/*
* If we are going to sleep and we have plugged IO queued,
@@ -7086,7 +7098,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
void *key)
{
- WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU));
+ WARN_ON_ONCE(wake_flags & ~(WF_SYNC|WF_CURRENT_CPU));
return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);
@@ -7276,12 +7288,12 @@ out_unlock:
#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
int __sched __cond_resched(void)
{
- if (should_resched(0)) {
+ if (should_resched(0) && !irqs_disabled()) {
preempt_schedule_common();
return 1;
}
/*
- * In preemptible kernels, ->rcu_read_lock_nesting tells the tick
+ * In PREEMPT_RCU kernels, ->rcu_read_lock_nesting tells the tick
* whether the current CPU is in an RCU read-side critical section,
* so the tick can report quiescent states even for CPUs looping
* in kernel context. In contrast, in non-preemptible kernels,
@@ -7290,6 +7302,8 @@ int __sched __cond_resched(void)
* RCU quiescent state. Therefore, the following code causes
* cond_resched() to report a quiescent state, but only when RCU
* is in urgent need of one.
+ * A third case, preemptible, but non-PREEMPT_RCU provides for
+ * urgently needed quiescent states via rcu_flavor_sched_clock_irq().
*/
#ifndef CONFIG_PREEMPT_RCU
rcu_all_qs();
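/*
 * Illustration, not part of the patch: scheduling with interrupts
 * disabled is invalid, so the added irqs_disabled() check turns such a
 * call into a no-op instead of entering preempt_schedule_common():
 *
 *	local_irq_disable();
 *	cond_resched();		// now returns 0 here instead of scheduling
 *	local_irq_enable();
 */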
@@ -7638,10 +7652,57 @@ PREEMPT_MODEL_ACCESSOR(lazy);
#else /* !CONFIG_PREEMPT_DYNAMIC: */
+#define preempt_dynamic_mode -1
+
static inline void preempt_dynamic_init(void) { }
#endif /* CONFIG_PREEMPT_DYNAMIC */
+const char *preempt_modes[] = {
+ "none", "voluntary", "full", "lazy", NULL,
+};
+
+const char *preempt_model_str(void)
+{
+ bool brace = IS_ENABLED(CONFIG_PREEMPT_RT) &&
+ (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC) ||
+ IS_ENABLED(CONFIG_PREEMPT_LAZY));
+ static char buf[128];
+
+ if (IS_ENABLED(CONFIG_PREEMPT_BUILD)) {
+ struct seq_buf s;
+
+ seq_buf_init(&s, buf, sizeof(buf));
+ seq_buf_puts(&s, "PREEMPT");
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ seq_buf_printf(&s, "%sRT%s",
+ brace ? "_{" : "_",
+ brace ? "," : "");
+
+ if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) {
+ seq_buf_printf(&s, "(%s)%s",
+ preempt_dynamic_mode > 0 ?
+ preempt_modes[preempt_dynamic_mode] : "undef",
+ brace ? "}" : "");
+ return seq_buf_str(&s);
+ }
+
+ if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
+ seq_buf_printf(&s, "LAZY%s",
+ brace ? "}" : "");
+ return seq_buf_str(&s);
+ }
+
+ return seq_buf_str(&s);
+ }
+
+ if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BUILD))
+ return "VOLUNTARY";
+
+ return "NONE";
+}
+
int io_schedule_prepare(void)
{
int old_iowait = current->in_iowait;
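/*
 * Illustration, not part of the patch: outputs of preempt_model_str()
 * as implied by the code above, for a few example configurations:
 *
 *	PREEMPT_BUILD only			-> "PREEMPT"
 *	+ PREEMPT_RT				-> "PREEMPT_RT"
 *	+ PREEMPT_DYNAMIC, mode "full"		-> "PREEMPT(full)"
 *	+ PREEMPT_RT + PREEMPT_DYNAMIC ("full")	-> "PREEMPT_{RT,(full)}"
 *	+ PREEMPT_RT + PREEMPT_LAZY		-> "PREEMPT_{RT,LAZY}"
 *	PREEMPT_VOLUNTARY_BUILD			-> "VOLUNTARY"
 *	otherwise				-> "NONE"
 */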
@@ -7701,9 +7762,9 @@ void sched_show_task(struct task_struct *p)
if (pid_alive(p))
ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
- pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d flags:0x%08lx\n",
+ pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d task_flags:0x%04x flags:0x%08lx\n",
free, task_pid_nr(p), task_tgid_nr(p),
- ppid, read_task_thread_flags(p));
+ ppid, p->flags, read_task_thread_flags(p));
print_worker_info(KERN_INFO, p);
print_stop_info(KERN_INFO, p);
@@ -7756,10 +7817,9 @@ void show_state_filter(unsigned int state_filter)
sched_show_task(p);
}
-#ifdef CONFIG_SCHED_DEBUG
if (!state_filter)
sysrq_sched_debug_show();
-#endif
+
rcu_read_unlock();
/*
* Only show locks if all tasks are dumped:
@@ -7930,19 +7990,26 @@ void sched_setnuma(struct task_struct *p, int nid)
#ifdef CONFIG_HOTPLUG_CPU
/*
- * Ensure that the idle task is using init_mm right before its CPU goes
- * offline.
+ * Invoked on the outgoing CPU in context of the CPU hotplug thread
+ * after ensuring that there are no user space tasks left on the CPU.
+ *
+ * If there is a lazy mm in use on the hotplug thread, drop it and
+ * switch to init_mm.
+ *
+ * The reference count on init_mm is dropped in finish_cpu().
*/
-void idle_task_exit(void)
+static void sched_force_init_mm(void)
{
struct mm_struct *mm = current->active_mm;
- BUG_ON(cpu_online(smp_processor_id()));
- BUG_ON(current != this_rq()->idle);
-
if (mm != &init_mm) {
- switch_mm(mm, &init_mm, current);
+ mmgrab_lazy_tlb(&init_mm);
+ local_irq_disable();
+ current->active_mm = &init_mm;
+ switch_mm_irqs_off(mm, &init_mm, current);
+ local_irq_enable();
finish_arch_post_lock_switch();
+ mmdrop_lazy_tlb(mm);
}
/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
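/*
 * Illustration, not part of the patch: reference pairing for the lazy
 * mm switch above:
 *
 *	mmgrab_lazy_tlb(&init_mm);	// ref taken here, dropped later
 *					// in finish_cpu()
 *	...
 *	mmdrop_lazy_tlb(mm);		// drop the outgoing lazy mm ref
 *
 * Doing the switch via switch_mm_irqs_off() with IRQs disabled keeps
 * ->active_mm and the hardware mm in sync across the transition.
 */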
@@ -8167,7 +8234,7 @@ static void cpuset_cpu_active(void)
* operation in the resume sequence, just build a single sched
* domain, ignoring cpusets.
*/
- partition_sched_domains(1, NULL, NULL);
+ cpuset_reset_sched_domains();
if (--num_cpus_frozen)
return;
/*
@@ -8180,19 +8247,14 @@ static void cpuset_cpu_active(void)
cpuset_update_active_cpus();
}
-static int cpuset_cpu_inactive(unsigned int cpu)
+static void cpuset_cpu_inactive(unsigned int cpu)
{
if (!cpuhp_tasks_frozen) {
- int ret = dl_bw_check_overflow(cpu);
-
- if (ret)
- return ret;
cpuset_update_active_cpus();
} else {
num_cpus_frozen++;
- partition_sched_domains(1, NULL, NULL);
+ cpuset_reset_sched_domains();
}
- return 0;
}
static inline void sched_smt_present_inc(int cpu)
@@ -8254,6 +8316,11 @@ int sched_cpu_deactivate(unsigned int cpu)
struct rq *rq = cpu_rq(cpu);
int ret;
+ ret = dl_bw_deactivate(cpu);
+
+ if (ret)
+ return ret;
+
/*
* Remove CPU from nohz.idle_cpus_mask to prevent participating in
* load balancing when not active
@@ -8299,15 +8366,7 @@ int sched_cpu_deactivate(unsigned int cpu)
return 0;
sched_update_numa(cpu, false);
- ret = cpuset_cpu_inactive(cpu);
- if (ret) {
- sched_smt_present_inc(cpu);
- sched_set_rq_online(rq, cpu);
- balance_push_set(cpu, false);
- set_cpu_active(cpu, true);
- sched_update_numa(cpu, true);
- return ret;
- }
+ cpuset_cpu_inactive(cpu);
sched_domains_numa_masks_clear(cpu);
return 0;
}
@@ -8344,6 +8403,7 @@ int sched_cpu_starting(unsigned int cpu)
int sched_cpu_wait_empty(unsigned int cpu)
{
balance_hotplug_wait();
+ sched_force_init_mm();
return 0;
}
@@ -8415,9 +8475,9 @@ void __init sched_init_smp(void)
* CPU masks are stable and all blatant races in the below code cannot
* happen.
*/
- mutex_lock(&sched_domains_mutex);
+ sched_domains_mutex_lock();
sched_init_domains(cpu_active_mask);
- mutex_unlock(&sched_domains_mutex);
+ sched_domains_mutex_unlock();
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0)
@@ -9007,7 +9067,7 @@ void sched_release_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags);
}
-static struct task_group *sched_get_task_group(struct task_struct *tsk)
+static void sched_change_group(struct task_struct *tsk)
{
struct task_group *tg;
@@ -9019,13 +9079,7 @@ static struct task_group *sched_get_task_group(struct task_struct *tsk)
tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
-
- return tg;
-}
-
-static void sched_change_group(struct task_struct *tsk, struct task_group *group)
-{
- tsk->sched_task_group = group;
+ tsk->sched_task_group = tg;
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_change_group)
@@ -9042,24 +9096,15 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group
* now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
* its new group.
*/
-void sched_move_task(struct task_struct *tsk)
+void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
int queued, running, queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
- struct task_group *group;
struct rq *rq;
CLASS(task_rq_lock, rq_guard)(tsk);
rq = rq_guard.rq;
- /*
- * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous
- * group changes.
- */
- group = sched_get_task_group(tsk);
- if (group == tsk->sched_task_group)
- return;
-
update_rq_clock(rq);
running = task_current_donor(rq, tsk);
@@ -9070,8 +9115,9 @@ void sched_move_task(struct task_struct *tsk)
if (running)
put_prev_task(rq, tsk);
- sched_change_group(tsk, group);
- scx_move_task(tsk);
+ sched_change_group(tsk);
+ if (!for_autogroup)
+ scx_cgroup_move_task(tsk);
if (queued)
enqueue_task(rq, tsk, queue_flags);
@@ -9172,7 +9218,7 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
struct cgroup_subsys_state *css;
cgroup_taskset_for_each(task, css, tset)
- sched_move_task(task);
+ sched_move_task(task, false);
scx_cgroup_finish_attach();
}
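/*
 * Illustration, not part of the patch: the new @for_autogroup argument
 * distinguishes the two callers, so only real cgroup migrations reach
 * the sched_ext cgroup hook. Presumed autogroup-side call:
 *
 *	sched_move_task(tsk, true);	// autogroup move: skips
 *					// scx_cgroup_move_task()
 */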
@@ -9193,7 +9239,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css)
unsigned int clamps;
lockdep_assert_held(&uclamp_mutex);
- SCHED_WARN_ON(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held());
css_for_each_descendant_pre(css, top_css) {
uc_parent = css_tg(css)->parent
@@ -9285,7 +9331,7 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
if (req.ret)
return req.ret;
- static_branch_enable(&sched_uclamp_used);
+ sched_uclamp_enable();
guard(mutex)(&uclamp_mutex);
guard(rcu)();
@@ -10528,7 +10574,7 @@ static void task_mm_cid_work(struct callback_head *work)
struct mm_struct *mm;
int weight, cpu;
- SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work));
+ WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work));
work->next = work; /* Prevent double-add */
if (t->flags & PF_EXITING)
@@ -10590,7 +10636,7 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
return;
/* No page allocation under rq lock */
- task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC);
+ task_work_add(curr, work, TWA_RESUME);
}
void sched_mm_cid_exit_signals(struct task_struct *t)