diff options
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- | kernel/sched/core.c | 149 |
1 files changed, 82 insertions, 67 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 67189907214d..87540217fc09 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -91,7 +91,6 @@ #include "autogroup.h" #include "pelt.h" #include "smp.h" -#include "stats.h" #include "../workqueue_internal.h" #include "../../io_uring/io-wq.h" @@ -119,7 +118,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -#ifdef CONFIG_SCHED_DEBUG /* * Debugging: various feature bits * @@ -129,7 +127,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); */ #define SCHED_FEAT(name, enabled) \ (1UL << __SCHED_FEAT_##name) * enabled | -const_debug unsigned int sysctl_sched_features = +__read_mostly unsigned int sysctl_sched_features = #include "features.h" 0; #undef SCHED_FEAT @@ -143,13 +141,12 @@ const_debug unsigned int sysctl_sched_features = */ __read_mostly int sysctl_resched_latency_warn_ms = 100; __read_mostly int sysctl_resched_latency_warn_once = 1; -#endif /* CONFIG_SCHED_DEBUG */ /* * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. */ -const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK; +__read_mostly unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK; __read_mostly int scheduler_running; @@ -800,11 +797,10 @@ void update_rq_clock(struct rq *rq) if (rq->clock_update_flags & RQCF_ACT_SKIP) return; -#ifdef CONFIG_SCHED_DEBUG if (sched_feat(WARN_DOUBLE_CLOCK)) - SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED); + WARN_ON_ONCE(rq->clock_update_flags & RQCF_UPDATED); rq->clock_update_flags |= RQCF_UPDATED; -#endif + clock = sched_clock_cpu(cpu_of(rq)); scx_rq_clock_update(rq, clock); @@ -916,8 +912,7 @@ static void hrtick_rq_init(struct rq *rq) #ifdef CONFIG_SMP INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); #endif - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - rq->hrtick_timer.function = hrtick; + hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) @@ -1720,7 +1715,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, bucket = &uc_rq->bucket[uc_se->bucket_id]; - SCHED_WARN_ON(!bucket->tasks); + WARN_ON_ONCE(!bucket->tasks); if (likely(bucket->tasks)) bucket->tasks--; @@ -1740,7 +1735,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, * Defensive programming: this should never happen. If it happens, * e.g. due to future modification, warn and fix up the expected value. */ - SCHED_WARN_ON(bucket->value > rq_clamp); + WARN_ON_ONCE(bucket->value > rq_clamp); if (bucket->value >= rq_clamp) { bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value); uclamp_rq_set(rq, clamp_id, bkt_clamp); @@ -1757,7 +1752,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) * The condition is constructed such that a NOP is generated when * sched_uclamp_used is disabled. */ - if (!static_branch_unlikely(&sched_uclamp_used)) + if (!uclamp_is_used()) return; if (unlikely(!p->sched_class->uclamp_enabled)) @@ -1784,7 +1779,7 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) * The condition is constructed such that a NOP is generated when * sched_uclamp_used is disabled. */ - if (!static_branch_unlikely(&sched_uclamp_used)) + if (!uclamp_is_used()) return; if (unlikely(!p->sched_class->uclamp_enabled)) @@ -1942,12 +1937,12 @@ static int sysctl_sched_uclamp_handler(const struct ctl_table *table, int write, } if (update_root_tg) { - static_branch_enable(&sched_uclamp_used); + sched_uclamp_enable(); uclamp_update_root_tg(); } if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) { - static_branch_enable(&sched_uclamp_used); + sched_uclamp_enable(); uclamp_sync_util_min_rt_default(); } @@ -2122,7 +2117,7 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { - SCHED_WARN_ON(flags & DEQUEUE_SLEEP); + WARN_ON_ONCE(flags & DEQUEUE_SLEEP); WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ASSERT_EXCLUSIVE_WRITER(p->on_rq); @@ -2727,7 +2722,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) * XXX do further audits, this smells like something putrid. */ if (ctx->flags & SCA_MIGRATE_DISABLE) - SCHED_WARN_ON(!p->on_cpu); + WARN_ON_ONCE(!p->on_cpu); else lockdep_assert_held(&p->pi_lock); @@ -3292,7 +3287,6 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p) void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { -#ifdef CONFIG_SCHED_DEBUG unsigned int state = READ_ONCE(p->__state); /* @@ -3330,7 +3324,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) WARN_ON_ONCE(!cpu_online(new_cpu)); WARN_ON_ONCE(is_migration_disabled(p)); -#endif trace_sched_migrate_task(p, new_cpu); @@ -3922,13 +3915,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu) static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { - /* - * The BPF scheduler may depend on select_task_rq() being invoked during - * wakeups. In addition, @p may end up executing on a different CPU - * regardless of what happens in the wakeup path making the ttwu_queue - * optimization less meaningful. Skip if on SCX. - */ - if (task_on_scx(p)) + /* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */ + if (!scx_allow_ttwu_queue(p)) return false; /* @@ -4196,7 +4184,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * - we're serialized against set_special_state() by virtue of * it disabling IRQs (this allows not taking ->pi_lock). */ - SCHED_WARN_ON(p->se.sched_delayed); + WARN_ON_ONCE(p->se.sched_delayed); if (!ttwu_state_match(p, state, &success)) goto out; @@ -4490,7 +4478,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) INIT_LIST_HEAD(&p->se.group_node); /* A delayed task cannot be in clone(). */ - SCHED_WARN_ON(p->se.sched_delayed); + WARN_ON_ONCE(p->se.sched_delayed); #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; @@ -5578,7 +5566,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -#ifdef CONFIG_SCHED_DEBUG static u64 cpu_resched_latency(struct rq *rq) { int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms); @@ -5623,9 +5610,6 @@ static int __init setup_resched_latency_warn_ms(char *str) return 1; } __setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms); -#else -static inline u64 cpu_resched_latency(struct rq *rq) { return 0; } -#endif /* CONFIG_SCHED_DEBUG */ /* * This function gets called by the timer code, with HZ frequency. @@ -5746,7 +5730,7 @@ static void sched_tick_remote(struct work_struct *work) * we are always sure that there is no proxy (only a * single task is running). */ - SCHED_WARN_ON(rq->curr != rq->donor); + WARN_ON_ONCE(rq->curr != rq->donor); update_rq_clock(rq); if (!is_idle_task(curr)) { @@ -5966,7 +5950,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) preempt_count_set(PREEMPT_DISABLED); } rcu_sleep_check(); - SCHED_WARN_ON(ct_state() == CT_STATE_USER); + WARN_ON_ONCE(ct_state() == CT_STATE_USER); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -6719,9 +6703,7 @@ static void __sched notrace __schedule(int sched_mode) picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); -#ifdef CONFIG_SCHED_DEBUG rq->last_seen_need_resched_ns = 0; -#endif if (likely(prev != next)) { rq->nr_switches++; @@ -6812,7 +6794,7 @@ static inline void sched_submit_work(struct task_struct *tsk) * deadlock if the callback attempts to acquire a lock which is * already acquired. */ - SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT); + WARN_ON_ONCE(current->__state & TASK_RTLOCK_WAIT); /* * If we are going to sleep and we have plugged IO queued, @@ -7095,7 +7077,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { - WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU)); + WARN_ON_ONCE(wake_flags & ~(WF_SYNC|WF_CURRENT_CPU)); return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); @@ -7290,7 +7272,7 @@ int __sched __cond_resched(void) return 1; } /* - * In preemptible kernels, ->rcu_read_lock_nesting tells the tick + * In PREEMPT_RCU kernels, ->rcu_read_lock_nesting tells the tick * whether the current CPU is in an RCU read-side critical section, * so the tick can report quiescent states even for CPUs looping * in kernel context. In contrast, in non-preemptible kernels, @@ -7299,6 +7281,8 @@ int __sched __cond_resched(void) * RCU quiescent state. Therefore, the following code causes * cond_resched() to report a quiescent state, but only when RCU * is in urgent need of one. + * A third case, preemptible, but non-PREEMPT_RCU provides for + * urgently needed quiescent states via rcu_flavor_sched_clock_irq(). */ #ifndef CONFIG_PREEMPT_RCU rcu_all_qs(); @@ -7647,10 +7631,57 @@ PREEMPT_MODEL_ACCESSOR(lazy); #else /* !CONFIG_PREEMPT_DYNAMIC: */ +#define preempt_dynamic_mode -1 + static inline void preempt_dynamic_init(void) { } #endif /* CONFIG_PREEMPT_DYNAMIC */ +const char *preempt_modes[] = { + "none", "voluntary", "full", "lazy", NULL, +}; + +const char *preempt_model_str(void) +{ + bool brace = IS_ENABLED(CONFIG_PREEMPT_RT) && + (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC) || + IS_ENABLED(CONFIG_PREEMPT_LAZY)); + static char buf[128]; + + if (IS_ENABLED(CONFIG_PREEMPT_BUILD)) { + struct seq_buf s; + + seq_buf_init(&s, buf, sizeof(buf)); + seq_buf_puts(&s, "PREEMPT"); + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + seq_buf_printf(&s, "%sRT%s", + brace ? "_{" : "_", + brace ? "," : ""); + + if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) { + seq_buf_printf(&s, "(%s)%s", + preempt_dynamic_mode > 0 ? + preempt_modes[preempt_dynamic_mode] : "undef", + brace ? "}" : ""); + return seq_buf_str(&s); + } + + if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) { + seq_buf_printf(&s, "LAZY%s", + brace ? "}" : ""); + return seq_buf_str(&s); + } + + return seq_buf_str(&s); + } + + if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BUILD)) + return "VOLUNTARY"; + + return "NONE"; +} + int io_schedule_prepare(void) { int old_iowait = current->in_iowait; @@ -7765,10 +7796,9 @@ void show_state_filter(unsigned int state_filter) sched_show_task(p); } -#ifdef CONFIG_SCHED_DEBUG if (!state_filter) sysrq_sched_debug_show(); -#endif + rcu_read_unlock(); /* * Only show locks if all tasks are dumped: @@ -8183,7 +8213,7 @@ static void cpuset_cpu_active(void) * operation in the resume sequence, just build a single sched * domain, ignoring cpusets. */ - partition_sched_domains(1, NULL, NULL); + cpuset_reset_sched_domains(); if (--num_cpus_frozen) return; /* @@ -8202,7 +8232,7 @@ static void cpuset_cpu_inactive(unsigned int cpu) cpuset_update_active_cpus(); } else { num_cpus_frozen++; - partition_sched_domains(1, NULL, NULL); + cpuset_reset_sched_domains(); } } @@ -8424,9 +8454,9 @@ void __init sched_init_smp(void) * CPU masks are stable and all blatant races in the below code cannot * happen. */ - mutex_lock(&sched_domains_mutex); + sched_domains_mutex_lock(); sched_init_domains(cpu_active_mask); - mutex_unlock(&sched_domains_mutex); + sched_domains_mutex_unlock(); /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0) @@ -9016,7 +9046,7 @@ void sched_release_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -static struct task_group *sched_get_task_group(struct task_struct *tsk) +static void sched_change_group(struct task_struct *tsk) { struct task_group *tg; @@ -9028,13 +9058,7 @@ static struct task_group *sched_get_task_group(struct task_struct *tsk) tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), struct task_group, css); tg = autogroup_task_group(tsk, tg); - - return tg; -} - -static void sched_change_group(struct task_struct *tsk, struct task_group *group) -{ - tsk->sched_task_group = group; + tsk->sched_task_group = tg; #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) @@ -9055,20 +9079,11 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup) { int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; - struct task_group *group; struct rq *rq; CLASS(task_rq_lock, rq_guard)(tsk); rq = rq_guard.rq; - /* - * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous - * group changes. - */ - group = sched_get_task_group(tsk); - if (group == tsk->sched_task_group) - return; - update_rq_clock(rq); running = task_current_donor(rq, tsk); @@ -9079,7 +9094,7 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup) if (running) put_prev_task(rq, tsk); - sched_change_group(tsk, group); + sched_change_group(tsk); if (!for_autogroup) scx_cgroup_move_task(tsk); @@ -9203,7 +9218,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) unsigned int clamps; lockdep_assert_held(&uclamp_mutex); - SCHED_WARN_ON(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held()); css_for_each_descendant_pre(css, top_css) { uc_parent = css_tg(css)->parent @@ -9295,7 +9310,7 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, if (req.ret) return req.ret; - static_branch_enable(&sched_uclamp_used); + sched_uclamp_enable(); guard(mutex)(&uclamp_mutex); guard(rcu)(); @@ -10538,7 +10553,7 @@ static void task_mm_cid_work(struct callback_head *work) struct mm_struct *mm; int weight, cpu; - SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work)); + WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work)); work->next = work; /* Prevent double-add */ if (t->flags & PF_EXITING) |