summary | refs | log | tree | commit | diff
path: root/kernel
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2026-04-14 13:33:36 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2026-04-14 13:33:36 -0700
commit: 1c3b68f0d55b5932eb38eda602a61aec6d6f5e5e (patch)
tree: ec22e8344526e4f2968507472f3f578429392dd4 /kernel
parent: 33c66eb5e9844429911bf5478c96c60f9f8af9d0 (diff)
parent: 78cde54ea5f03398f1cf6656de2472068f6da966 (diff)
download: lwn-1c3b68f0d55b5932eb38eda602a61aec6d6f5e5e.tar.gz
download: lwn-1c3b68f0d55b5932eb38eda602a61aec6d6f5e5e.zip
Merge tag 'sched-core-2026-04-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: "Fair scheduling updates: - Skip SCHED_IDLE rq for SCHED_IDLE tasks (Christian Loehle) - Remove superfluous rcu_read_lock() in the wakeup path (K Prateek Nayak) - Simplify the entry condition for update_idle_cpu_scan() (K Prateek Nayak) - Simplify SIS_UTIL handling in select_idle_cpu() (K Prateek Nayak) - Avoid overflow in enqueue_entity() (K Prateek Nayak) - Update overutilized detection (Vincent Guittot) - Prevent negative lag increase during delayed dequeue (Vincent Guittot) - Clear buddies for preempt_short (Vincent Guittot) - Implement more complex proportional newidle balance (Peter Zijlstra) - Increase weight bits for avg_vruntime (Peter Zijlstra) - Use full weight to __calc_delta() (Peter Zijlstra) RT and DL scheduling updates: - Fix incorrect schedstats for rt and dl thread (Dengjun Su) - Skip group schedulable check with rt_group_sched=0 (Michal Koutný) - Move group schedulability check to sched_rt_global_validate() (Michal Koutný) - Add reporting of runtime left & abs deadline to sched_getattr() for DEADLINE tasks (Tommaso Cucinotta) Scheduling topology updates by K Prateek Nayak: - Compute sd_weight considering cpuset partitions - Extract "imb_numa_nr" calculation into a separate helper - Allocate per-CPU sched_domain_shared in s_data - Switch to assigning "sd->shared" from s_data - Remove sched_domain_shared allocation with sd_data Energy-aware scheduling updates: - Filter false overloaded_group case for EAS (Vincent Guittot) - PM: EM: Switch to rcu_dereference_all() in wakeup path (Dietmar Eggemann) Infrastructure updates: - Replace use of system_unbound_wq with system_dfl_wq (Marco Crivellari) Proxy scheduling updates by John Stultz: - Make class_schedulers avoid pushing current, and get rid of proxy_tag_curr() - Minimise repeated sched_proxy_exec() checking - Fix potentially missing balancing with Proxy Exec - Fix and improve task::blocked_on et al handling - Add assert_balance_callbacks_empty() helper - Add 
logic to zap balancing callbacks if we pick again - Move attach_one_task() and attach_task() helpers to sched.h - Handle blocked-waiter migration (and return migration) - Add K Prateek Nayak to scheduler reviewers for proxy execution Misc cleanups and fixes by John Stultz, Joseph Salisbury, Peter Zijlstra, K Prateek Nayak, Michal Koutný, Randy Dunlap, Shrikanth Hegde, Vincent Guittot, Zhan Xusheng, Xie Yuanbin and Vincent Guittot" * tag 'sched-core-2026-04-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (46 commits) sched/eevdf: Clear buddies for preempt_short sched/rt: Cleanup global RT bandwidth functions sched/rt: Move group schedulability check to sched_rt_global_validate() sched/rt: Skip group schedulable check with rt_group_sched=0 sched/fair: Avoid overflow in enqueue_entity() sched: Use u64 for bandwidth ratio calculations sched/fair: Prevent negative lag increase during delayed dequeue sched/fair: Use sched_energy_enabled() sched: Handle blocked-waiter migration (and return migration) sched: Move attach_one_task and attach_task helpers to sched.h sched: Add logic to zap balance callbacks if we pick again sched: Add assert_balance_callbacks_empty helper sched/locking: Add special p->blocked_on==PROXY_WAKING value for proxy return-migration sched: Fix modifying donor->blocked on without proper locking locking: Add task::blocked_lock to serialize blocked_on state sched: Fix potentially missing balancing with Proxy Exec sched: Minimise repeated sched_proxy_exec() checking sched: Make class_schedulers avoid pushing current, and get rid of proxy_tag_curr() MAINTAINERS: Add K Prateek Nayak to scheduler reviewers sched/core: Get this cpu once in ttwu_queue_cond() ...
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/fork.c 1
-rw-r--r-- kernel/locking/mutex-debug.c 4
-rw-r--r-- kernel/locking/mutex.c 40
-rw-r--r-- kernel/locking/mutex.h 6
-rw-r--r-- kernel/locking/ww_mutex.h 16
-rw-r--r-- kernel/sched/core.c 334
-rw-r--r-- kernel/sched/deadline.c 41
-rw-r--r-- kernel/sched/debug.c 14
-rw-r--r-- kernel/sched/ext.c 4
-rw-r--r-- kernel/sched/fair.c 513
-rw-r--r-- kernel/sched/features.h 3
-rw-r--r-- kernel/sched/rt.c 64
-rw-r--r-- kernel/sched/sched.h 50
-rw-r--r-- kernel/sched/syscalls.c 16
-rw-r--r-- kernel/sched/topology.c 279
15 files changed, 984 insertions, 401 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index e14970fbc4ee..131ae7bbb0de 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2113,6 +2113,7 @@ __latent_entropy struct task_struct *copy_process(
ftrace_graph_init_task(p);
rt_mutex_init_task(p);
+ raw_spin_lock_init(&p->blocked_lock);
lockdep_assert_irqs_enabled();
#ifdef CONFIG_PROVE_LOCKING
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 94930d506bcf..785decd9d0c0 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -53,13 +53,13 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
lockdep_assert_held(&lock->wait_lock);
/* Current thread can't be already blocked (since it's executing!) */
- DEBUG_LOCKS_WARN_ON(__get_task_blocked_on(task));
+ DEBUG_LOCKS_WARN_ON(get_task_blocked_on(task));
}
void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct task_struct *task)
{
- struct mutex *blocked_on = __get_task_blocked_on(task);
+ struct mutex *blocked_on = get_task_blocked_on(task);
DEBUG_LOCKS_WARN_ON(waiter->task != task);
DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on != lock);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 427187ff02db..186b463fe326 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -674,6 +674,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
goto err_early_kill;
}
+ raw_spin_lock(&current->blocked_lock);
__set_task_blocked_on(current, lock);
set_current_state(state);
trace_contention_begin(lock, LCB_F_MUTEX);
@@ -687,8 +688,9 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
* the handoff.
*/
if (__mutex_trylock(lock))
- goto acquired;
+ break;
+ raw_spin_unlock(&current->blocked_lock);
/*
* Check for signals and kill conditions while holding
* wait_lock. This ensures the lock cancellation is ordered
@@ -711,12 +713,14 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
first = lock->first_waiter == &waiter;
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ raw_spin_lock(&current->blocked_lock);
/*
* As we likely have been woken up by task
* that has cleared our blocked_on state, re-set
* it to the lock we are trying to acquire.
*/
- set_task_blocked_on(current, lock);
+ __set_task_blocked_on(current, lock);
set_current_state(state);
/*
* Here we order against unlock; we must either see it change
@@ -727,25 +731,33 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
break;
if (first) {
- trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
+ bool opt_acquired;
+
/*
* mutex_optimistic_spin() can call schedule(), so
- * clear blocked on so we don't become unselectable
+ * we need to release these locks before calling it,
+ * and clear blocked on so we don't become unselectable
* to run.
*/
- clear_task_blocked_on(current, lock);
- if (mutex_optimistic_spin(lock, ww_ctx, &waiter))
+ __clear_task_blocked_on(current, lock);
+ raw_spin_unlock(&current->blocked_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
+ opt_acquired = mutex_optimistic_spin(lock, ww_ctx, &waiter);
+
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ raw_spin_lock(&current->blocked_lock);
+ __set_task_blocked_on(current, lock);
+
+ if (opt_acquired)
break;
- set_task_blocked_on(current, lock);
trace_contention_begin(lock, LCB_F_MUTEX);
}
-
- raw_spin_lock_irqsave(&lock->wait_lock, flags);
}
- raw_spin_lock_irqsave(&lock->wait_lock, flags);
-acquired:
__clear_task_blocked_on(current, lock);
__set_current_state(TASK_RUNNING);
+ raw_spin_unlock(&current->blocked_lock);
if (ww_ctx) {
/*
@@ -773,11 +785,11 @@ skip_wait:
return 0;
err:
- __clear_task_blocked_on(current, lock);
+ clear_task_blocked_on(current, lock);
__set_current_state(TASK_RUNNING);
__mutex_remove_waiter(lock, &waiter);
err_early_kill:
- WARN_ON(__get_task_blocked_on(current));
+ WARN_ON(get_task_blocked_on(current));
trace_contention_end(lock, ret);
raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
debug_mutex_free_waiter(&waiter);
@@ -993,7 +1005,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
next = waiter->task;
debug_mutex_wake_waiter(lock, waiter);
- __clear_task_blocked_on(next, lock);
+ set_task_blocked_on_waking(next, lock);
wake_q_add(&wake_q, next);
}
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index b94ef40c1f48..3e263e98e5fc 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -48,6 +48,12 @@ static inline struct task_struct *__mutex_owner(struct mutex *lock)
return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS);
}
+static inline struct mutex *get_task_blocked_on(struct task_struct *p)
+{
+ guard(raw_spinlock_irqsave)(&p->blocked_lock);
+ return __get_task_blocked_on(p);
+}
+
#ifdef CONFIG_DEBUG_MUTEXES
extern void debug_mutex_lock_common(struct mutex *lock,
struct mutex_waiter *waiter);
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
index b1834ab7e782..016f0db892a5 100644
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -290,11 +290,11 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
debug_mutex_wake_waiter(lock, waiter);
#endif
/*
- * When waking up the task to die, be sure to clear the
- * blocked_on pointer. Otherwise we can see circular
- * blocked_on relationships that can't resolve.
+ * When waking up the task to die, be sure to set the
+ * blocked_on to PROXY_WAKING. Otherwise we can see
+ * circular blocked_on relationships that can't resolve.
*/
- __clear_task_blocked_on(waiter->task, lock);
+ set_task_blocked_on_waking(waiter->task, lock);
wake_q_add(wake_q, waiter->task);
}
@@ -345,15 +345,15 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
*/
if (owner != current) {
/*
- * When waking up the task to wound, be sure to clear the
- * blocked_on pointer. Otherwise we can see circular
- * blocked_on relationships that can't resolve.
+ * When waking up the task to wound, be sure to set the
+ * blocked_on to PROXY_WAKING. Otherwise we can see
+ * circular blocked_on relationships that can't resolve.
*
* NOTE: We pass NULL here instead of lock, because we
* are waking the mutex owner, who may be currently
* blocked on a different mutex.
*/
- __clear_task_blocked_on(owner, NULL);
+ set_task_blocked_on_waking(owner, NULL);
wake_q_add(wake_q, owner);
}
return true;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4495929f4c9b..f351296922ac 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -687,11 +687,6 @@ bool raw_spin_rq_trylock(struct rq *rq)
}
}
-void raw_spin_rq_unlock(struct rq *rq)
-{
- raw_spin_unlock(rq_lockp(rq));
-}
-
/*
* double_rq_lock - safely lock two runqueues
*/
@@ -3905,6 +3900,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
+ int this_cpu = smp_processor_id();
+
/* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */
if (!scx_allow_ttwu_queue(p))
return false;
@@ -3929,10 +3926,10 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
* If the CPU does not share cache, then queue the task on the
* remote rqs wakelist to avoid accessing remote data.
*/
- if (!cpus_share_cache(smp_processor_id(), cpu))
+ if (!cpus_share_cache(this_cpu, cpu))
return true;
- if (cpu == smp_processor_id())
+ if (cpu == this_cpu)
return false;
/*
@@ -4796,7 +4793,7 @@ void sched_post_fork(struct task_struct *p)
scx_post_fork(p);
}
-unsigned long to_ratio(u64 period, u64 runtime)
+u64 to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
return BW_UNIT;
@@ -4971,6 +4968,34 @@ static inline void finish_task(struct task_struct *prev)
smp_store_release(&prev->on_cpu, 0);
}
+/*
+ * Only called from __schedule context
+ *
+ * There are some cases where we are going to re-do the action
+ * that added the balance callbacks. We may not be in a state
+ * where we can run them, so just zap them so they can be
+ * properly re-added on the next time around. This is similar
+ * handling to running the callbacks, except we just don't call
+ * them.
+ */
+static void zap_balance_callbacks(struct rq *rq)
+{
+ struct balance_callback *next, *head;
+ bool found = false;
+
+ lockdep_assert_rq_held(rq);
+
+ head = rq->balance_callback;
+ while (head) {
+ if (head == &balance_push_callback)
+ found = true;
+ next = head->next;
+ head->next = NULL;
+ head = next;
+ }
+ rq->balance_callback = found ? &balance_push_callback : NULL;
+}
+
static void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
{
void (*func)(struct rq *rq);
@@ -5740,7 +5765,7 @@ static void sched_tick_remote(struct work_struct *work)
os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
if (os == TICK_SCHED_REMOTE_RUNNING)
- queue_delayed_work(system_unbound_wq, dwork, HZ);
+ queue_delayed_work(system_dfl_wq, dwork, HZ);
}
static void sched_tick_start(int cpu)
@@ -5759,7 +5784,7 @@ static void sched_tick_start(int cpu)
if (os == TICK_SCHED_REMOTE_OFFLINE) {
twork->cpu = cpu;
INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
- queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+ queue_delayed_work(system_dfl_wq, &twork->work, HZ);
}
}
@@ -6557,6 +6582,8 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
if (signal_pending_state(task_state, p)) {
WRITE_ONCE(p->__state, TASK_RUNNING);
*task_state_p = TASK_RUNNING;
+ set_task_blocked_on_waking(p, NULL);
+
return false;
}
@@ -6594,6 +6621,21 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
}
#ifdef CONFIG_SCHED_PROXY_EXEC
+static inline void proxy_set_task_cpu(struct task_struct *p, int cpu)
+{
+ unsigned int wake_cpu;
+
+ /*
+ * Since we are enqueuing a blocked task on a cpu it may
+ * not be able to run on, preserve wake_cpu when we
+ * __set_task_cpu so we can return the task to where it
+ * was previously runnable.
+ */
+ wake_cpu = p->wake_cpu;
+ __set_task_cpu(p, cpu);
+ p->wake_cpu = wake_cpu;
+}
+
static inline struct task_struct *proxy_resched_idle(struct rq *rq)
{
put_prev_set_next_task(rq, rq->donor, rq->idle);
@@ -6602,7 +6644,7 @@ static inline struct task_struct *proxy_resched_idle(struct rq *rq)
return rq->idle;
}
-static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor)
+static bool proxy_deactivate(struct rq *rq, struct task_struct *donor)
{
unsigned long state = READ_ONCE(donor->__state);
@@ -6622,17 +6664,140 @@ static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor)
return try_to_block_task(rq, donor, &state, true);
}
-static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor)
+static inline void proxy_release_rq_lock(struct rq *rq, struct rq_flags *rf)
+ __releases(__rq_lockp(rq))
+{
+ /*
+ * The class scheduler may have queued a balance callback
+ * from pick_next_task() called earlier.
+ *
+ * So here we have to zap callbacks before unlocking the rq
+ * as another CPU may jump in and call sched_balance_rq
+ * which can trip the warning in rq_pin_lock() if we
+ * leave callbacks set.
+ *
+ * After we later reacquire the rq lock, we will force __schedule()
+ * to pick_again, so the callbacks will get re-established.
+ */
+ zap_balance_callbacks(rq);
+ rq_unpin_lock(rq, rf);
+ raw_spin_rq_unlock(rq);
+}
+
+static inline void proxy_reacquire_rq_lock(struct rq *rq, struct rq_flags *rf)
+ __acquires(__rq_lockp(rq))
+{
+ raw_spin_rq_lock(rq);
+ rq_repin_lock(rq, rf);
+ update_rq_clock(rq);
+}
+
+/*
+ * If the blocked-on relationship crosses CPUs, migrate @p to the
+ * owner's CPU.
+ *
+ * This is because we must respect the CPU affinity of execution
+ * contexts (owner) but we can ignore affinity for scheduling
+ * contexts (@p). So we have to move scheduling contexts towards
+ * potential execution contexts.
+ *
+ * Note: The owner can disappear, but simply migrate to @target_cpu
+ * and leave that CPU to sort things out.
+ */
+static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf,
+ struct task_struct *p, int target_cpu)
+ __must_hold(__rq_lockp(rq))
+{
+ struct rq *target_rq = cpu_rq(target_cpu);
+
+ lockdep_assert_rq_held(rq);
+ WARN_ON(p == rq->curr);
+ /*
+ * Since we are migrating a blocked donor, it could be rq->donor,
+ * and we want to make sure there aren't any references from this
+ * rq to it before we drop the lock. This avoids another cpu
+ * jumping in and grabbing the rq lock and referencing rq->donor
+ * or cfs_rq->curr, etc after we have migrated it to another cpu,
+ * and before we pick_again in __schedule.
+ *
+ * So call proxy_resched_idle() to drop the rq->donor references
+ * before we release the lock.
+ */
+ proxy_resched_idle(rq);
+
+ deactivate_task(rq, p, DEQUEUE_NOCLOCK);
+ proxy_set_task_cpu(p, target_cpu);
+
+ proxy_release_rq_lock(rq, rf);
+
+ attach_one_task(target_rq, p);
+
+ proxy_reacquire_rq_lock(rq, rf);
+}
+
+static void proxy_force_return(struct rq *rq, struct rq_flags *rf,
+ struct task_struct *p)
+ __must_hold(__rq_lockp(rq))
{
- if (!__proxy_deactivate(rq, donor)) {
+ struct rq *task_rq, *target_rq = NULL;
+ int cpu, wake_flag = WF_TTWU;
+
+ lockdep_assert_rq_held(rq);
+ WARN_ON(p == rq->curr);
+
+ if (p == rq->donor)
+ proxy_resched_idle(rq);
+
+ proxy_release_rq_lock(rq, rf);
+ /*
+ * We drop the rq lock, and re-grab task_rq_lock to get
+ * the pi_lock (needed for select_task_rq) as well.
+ */
+ scoped_guard (task_rq_lock, p) {
+ task_rq = scope.rq;
+
+ /*
+ * Since we let go of the rq lock, the task may have been
+ * woken or migrated to another rq before we got the
+ * task_rq_lock. So re-check we're on the same RQ. If
+ * not, the task has already been migrated and that CPU
+ * will handle any further migrations.
+ */
+ if (task_rq != rq)
+ break;
+
+ /*
+ * Similarly, if we've been dequeued, someone else will
+ * wake us
+ */
+ if (!task_on_rq_queued(p))
+ break;
+
/*
- * XXX: For now, if deactivation failed, set donor
- * as unblocked, as we aren't doing proxy-migrations
- * yet (more logic will be needed then).
+ * Since we should only be calling here from __schedule()
+ * -> find_proxy_task(), no one else should have
+ * assigned current out from under us. But check and warn
+ * if we see this, then bail.
*/
- donor->blocked_on = NULL;
+ if (task_current(task_rq, p) || task_on_cpu(task_rq, p)) {
+ WARN_ONCE(1, "%s rq: %i current/on_cpu task %s %d on_cpu: %i\n",
+ __func__, cpu_of(task_rq),
+ p->comm, p->pid, p->on_cpu);
+ break;
+ }
+
+ update_rq_clock(task_rq);
+ deactivate_task(task_rq, p, DEQUEUE_NOCLOCK);
+ cpu = select_task_rq(p, p->wake_cpu, &wake_flag);
+ set_task_cpu(p, cpu);
+ target_rq = cpu_rq(cpu);
+ clear_task_blocked_on(p, NULL);
}
- return NULL;
+
+ if (target_rq)
+ attach_one_task(target_rq, p);
+
+ proxy_reacquire_rq_lock(rq, rf);
}
/*
@@ -6646,31 +6811,41 @@ static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *d
* p->pi_lock
* rq->lock
* mutex->wait_lock
+ * p->blocked_lock
*
* Returns the task that is going to be used as execution context (the one
* that is actually going to be run on cpu_of(rq)).
*/
static struct task_struct *
find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
+ __must_hold(__rq_lockp(rq))
{
struct task_struct *owner = NULL;
+ bool curr_in_chain = false;
int this_cpu = cpu_of(rq);
struct task_struct *p;
struct mutex *mutex;
+ int owner_cpu;
/* Follow blocked_on chain. */
- for (p = donor; task_is_blocked(p); p = owner) {
- mutex = p->blocked_on;
- /* Something changed in the chain, so pick again */
- if (!mutex)
- return NULL;
+ for (p = donor; (mutex = p->blocked_on); p = owner) {
+ /* If it's PROXY_WAKING, do return migration or run if current */
+ if (mutex == PROXY_WAKING) {
+ if (task_current(rq, p)) {
+ clear_task_blocked_on(p, PROXY_WAKING);
+ return p;
+ }
+ goto force_return;
+ }
+
/*
* By taking mutex->wait_lock we hold off concurrent mutex_unlock()
* and ensure @owner sticks around.
*/
guard(raw_spinlock)(&mutex->wait_lock);
+ guard(raw_spinlock)(&p->blocked_lock);
- /* Check again that p is blocked with wait_lock held */
+ /* Check again that p is blocked with blocked_lock held */
if (mutex != __get_task_blocked_on(p)) {
/*
* Something changed in the blocked_on chain and
@@ -6681,20 +6856,39 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
return NULL;
}
+ if (task_current(rq, p))
+ curr_in_chain = true;
+
owner = __mutex_owner(mutex);
if (!owner) {
- __clear_task_blocked_on(p, mutex);
- return p;
+ /*
+ * If there is no owner, either clear blocked_on
+ * and return p (if it is current and safe to
+ * just run on this rq), or return-migrate the task.
+ */
+ if (task_current(rq, p)) {
+ __clear_task_blocked_on(p, NULL);
+ return p;
+ }
+ goto force_return;
}
if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) {
/* XXX Don't handle blocked owners/delayed dequeue yet */
- return proxy_deactivate(rq, donor);
+ if (curr_in_chain)
+ return proxy_resched_idle(rq);
+ goto deactivate;
}
- if (task_cpu(owner) != this_cpu) {
- /* XXX Don't handle migrations yet */
- return proxy_deactivate(rq, donor);
+ owner_cpu = task_cpu(owner);
+ if (owner_cpu != this_cpu) {
+ /*
+ * @owner can disappear, simply migrate to @owner_cpu
+ * and leave that CPU to sort things out.
+ */
+ if (curr_in_chain)
+ return proxy_resched_idle(rq);
+ goto migrate_task;
}
if (task_on_rq_migrating(owner)) {
@@ -6751,9 +6945,20 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
* guarantee its existence, as per ttwu_remote().
*/
}
-
WARN_ON_ONCE(owner && !owner->on_rq);
return owner;
+
+deactivate:
+ if (proxy_deactivate(rq, donor))
+ return NULL;
+ /* If deactivate fails, force return */
+ p = donor;
+force_return:
+ proxy_force_return(rq, rf, p);
+ return NULL;
+migrate_task:
+ proxy_migrate_task(rq, rf, p, owner_cpu);
+ return NULL;
}
#else /* SCHED_PROXY_EXEC */
static struct task_struct *
@@ -6764,23 +6969,6 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
}
#endif /* SCHED_PROXY_EXEC */
-static inline void proxy_tag_curr(struct rq *rq, struct task_struct *owner)
-{
- if (!sched_proxy_exec())
- return;
- /*
- * pick_next_task() calls set_next_task() on the chosen task
- * at some point, which ensures it is not push/pullable.
- * However, the chosen/donor task *and* the mutex owner form an
- * atomic pair wrt push/pull.
- *
- * Make sure owner we run is not pushable. Unfortunately we can
- * only deal with that by means of a dequeue/enqueue cycle. :-/
- */
- dequeue_task(rq, owner, DEQUEUE_NOCLOCK | DEQUEUE_SAVE);
- enqueue_task(rq, owner, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE);
-}
-
/*
* __schedule() is the main scheduler function.
*
@@ -6907,16 +7095,45 @@ static void __sched notrace __schedule(int sched_mode)
}
pick_again:
+ assert_balance_callbacks_empty(rq);
next = pick_next_task(rq, rq->donor, &rf);
- rq_set_donor(rq, next);
rq->next_class = next->sched_class;
- if (unlikely(task_is_blocked(next))) {
- next = find_proxy_task(rq, next, &rf);
- if (!next)
- goto pick_again;
- if (next == rq->idle)
- goto keep_resched;
+ if (sched_proxy_exec()) {
+ struct task_struct *prev_donor = rq->donor;
+
+ rq_set_donor(rq, next);
+ if (unlikely(next->blocked_on)) {
+ next = find_proxy_task(rq, next, &rf);
+ if (!next) {
+ zap_balance_callbacks(rq);
+ goto pick_again;
+ }
+ if (next == rq->idle) {
+ zap_balance_callbacks(rq);
+ goto keep_resched;
+ }
+ }
+ if (rq->donor == prev_donor && prev != next) {
+ struct task_struct *donor = rq->donor;
+ /*
+ * When transitioning like:
+ *
+ * prev next
+ * donor: B B
+ * curr: A B or C
+ *
+ * then put_prev_set_next_task() will not have done
+ * anything, since B == B. However, A might have
+ * missed a RT/DL balance opportunity due to being
+ * on_cpu.
+ */
+ donor->sched_class->put_prev_task(rq, donor, donor);
+ donor->sched_class->set_next_task(rq, donor, true);
+ }
+ } else {
+ rq_set_donor(rq, next);
}
+
picked:
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
@@ -6932,9 +7149,6 @@ keep_resched:
*/
RCU_INIT_POINTER(rq->curr, next);
- if (!task_current_donor(rq, next))
- proxy_tag_curr(rq, next);
-
/*
* The membarrier system call requires each architecture
* to have a full memory barrier after updating
@@ -6968,10 +7182,6 @@ keep_resched:
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
} else {
- /* In case next was already curr but just got blocked_donor */
- if (!task_current_donor(rq, next))
- proxy_tag_curr(rq, next);
-
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq, NULL);
hrtick_schedule_exit(rq);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 8c3c1fe8d3a6..0b8de7156f38 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2142,10 +2142,14 @@ update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
int flags)
{
struct task_struct *p = dl_task_of(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
if (!schedstat_enabled())
return;
+ if (p != rq->curr)
+ update_stats_wait_end_dl(dl_rq, dl_se);
+
if ((flags & DEQUEUE_SLEEP)) {
unsigned int state;
@@ -2801,12 +2805,26 @@ static int find_later_rq(struct task_struct *task)
static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
{
- struct task_struct *p;
+ struct task_struct *i, *p = NULL;
+ struct rb_node *next_node;
if (!has_pushable_dl_tasks(rq))
return NULL;
- p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
+ next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root);
+ while (next_node) {
+ i = __node_2_pdl(next_node);
+ /* make sure task isn't on_cpu (possible with proxy-exec) */
+ if (!task_on_cpu(rq, i)) {
+ p = i;
+ break;
+ }
+
+ next_node = rb_next(next_node);
+ }
+
+ if (!p)
+ return NULL;
WARN_ON_ONCE(rq->cpu != task_cpu(p));
WARN_ON_ONCE(task_current(rq, p));
@@ -3613,13 +3631,26 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
}
-void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
+void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags)
{
struct sched_dl_entity *dl_se = &p->dl;
+ struct rq *rq = task_rq(p);
+ u64 adj_deadline;
attr->sched_priority = p->rt_priority;
- attr->sched_runtime = dl_se->dl_runtime;
- attr->sched_deadline = dl_se->dl_deadline;
+ if (flags & SCHED_GETATTR_FLAG_DL_DYNAMIC) {
+ guard(raw_spinlock_irq)(&rq->__lock);
+ update_rq_clock(rq);
+ if (task_current(rq, p))
+ update_curr_dl(rq);
+
+ attr->sched_runtime = dl_se->runtime;
+ adj_deadline = dl_se->deadline - rq_clock(rq) + ktime_get_ns();
+ attr->sched_deadline = adj_deadline;
+ } else {
+ attr->sched_runtime = dl_se->dl_runtime;
+ attr->sched_deadline = dl_se->dl_deadline;
+ }
attr->sched_period = dl_se->dl_period;
attr->sched_flags &= ~SCHED_DL_FLAGS;
attr->sched_flags |= dl_se->flags;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 15bf45b6f912..74c1617cf652 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -8,6 +8,7 @@
*/
#include <linux/debugfs.h>
#include <linux/nmi.h>
+#include <linux/log2.h>
#include "sched.h"
/*
@@ -901,11 +902,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
- s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread;
+ s64 left_vruntime = -1, right_vruntime = -1, left_deadline = -1, spread;
+ s64 zero_vruntime = -1, sum_w_vruntime = -1;
u64 avruntime;
struct sched_entity *last, *first, *root;
struct rq *rq = cpu_rq(cpu);
+ unsigned int sum_shift;
unsigned long flags;
+ u64 sum_weight;
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, "\n");
@@ -926,6 +930,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
if (last)
right_vruntime = last->vruntime;
zero_vruntime = cfs_rq->zero_vruntime;
+ sum_w_vruntime = cfs_rq->sum_w_vruntime;
+ sum_weight = cfs_rq->sum_weight;
+ sum_shift = cfs_rq->sum_shift;
avruntime = avg_vruntime(cfs_rq);
raw_spin_rq_unlock_irqrestore(rq, flags);
@@ -935,6 +942,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(left_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime",
SPLIT_NS(zero_vruntime));
+ SEQ_printf(m, " .%-30s: %Ld (%d bits)\n", "sum_w_vruntime",
+ sum_w_vruntime, ilog2(abs(sum_w_vruntime)));
+ SEQ_printf(m, " .%-30s: %Lu\n", "sum_weight",
+ sum_weight);
+ SEQ_printf(m, " .%-30s: %u\n", "sum_shift", sum_shift);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
SPLIT_NS(avruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 064eaa76be4b..04fc5c9fee14 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2837,7 +2837,7 @@ static void scx_watchdog_workfn(struct work_struct *work)
cond_resched();
}
- queue_delayed_work(system_unbound_wq, to_delayed_work(work),
+ queue_delayed_work(system_dfl_wq, to_delayed_work(work),
READ_ONCE(scx_watchdog_timeout) / 2);
}
@@ -5164,7 +5164,7 @@ static void scx_enable_workfn(struct kthread_work *work)
WRITE_ONCE(scx_watchdog_timeout, timeout);
WRITE_ONCE(scx_watchdog_timestamp, jiffies);
- queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
+ queue_delayed_work(system_dfl_wq, &scx_watchdog_work,
READ_ONCE(scx_watchdog_timeout) / 2);
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2be80780ff51..69361c63353a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -225,6 +225,7 @@ void __init sched_init_granularity(void)
update_sysctl();
}
+#ifndef CONFIG_64BIT
#define WMULT_CONST (~0U)
#define WMULT_SHIFT 32
@@ -283,6 +284,12 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
return mul_u64_u32_shr(delta_exec, fact, shift);
}
+#else
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
+{
+ return (delta_exec * weight) / lw->weight;
+}
+#endif
/*
* delta /= w
@@ -665,25 +672,83 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Since zero_vruntime closely tracks the per-task service, these
* deltas: (v_i - v0), will be in the order of the maximal (virtual) lag
* induced in the system due to quantisation.
- *
- * Also, we use scale_load_down() to reduce the size.
- *
- * As measured, the max (key * weight) value was ~44 bits for a kernel build.
*/
+static inline unsigned long avg_vruntime_weight(struct cfs_rq *cfs_rq, unsigned long w)
+{
+#ifdef CONFIG_64BIT
+ if (cfs_rq->sum_shift)
+ w = max(2UL, w >> cfs_rq->sum_shift);
+#endif
+ return w;
+}
+
+static inline void
+__sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
+ s64 w_vruntime, key = entity_key(cfs_rq, se);
+
+ w_vruntime = key * weight;
+ WARN_ON_ONCE((w_vruntime >> 63) != (w_vruntime >> 62));
+
+ cfs_rq->sum_w_vruntime += w_vruntime;
+ cfs_rq->sum_weight += weight;
+}
+
static void
-sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+sum_w_vruntime_add_paranoid(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- unsigned long weight = scale_load_down(se->load.weight);
- s64 key = entity_key(cfs_rq, se);
+ unsigned long weight;
+ s64 key, tmp;
+
+again:
+ weight = avg_vruntime_weight(cfs_rq, se->load.weight);
+ key = entity_key(cfs_rq, se);
+
+ if (check_mul_overflow(key, weight, &key))
+ goto overflow;
+
+ if (check_add_overflow(cfs_rq->sum_w_vruntime, key, &tmp))
+ goto overflow;
- cfs_rq->sum_w_vruntime += key * weight;
+ cfs_rq->sum_w_vruntime = tmp;
cfs_rq->sum_weight += weight;
+ return;
+
+overflow:
+ /*
+ * There's gotta be a limit -- if we're still failing at this point
+ * there's really nothing much to be done about things.
+ */
+ BUG_ON(cfs_rq->sum_shift >= 10);
+ cfs_rq->sum_shift++;
+
+ /*
+ * Note: \Sum (k_i * (w_i >> 1)) != (\Sum (k_i * w_i)) >> 1
+ */
+ cfs_rq->sum_w_vruntime = 0;
+ cfs_rq->sum_weight = 0;
+
+ for (struct rb_node *node = cfs_rq->tasks_timeline.rb_leftmost;
+ node; node = rb_next(node))
+ __sum_w_vruntime_add(cfs_rq, __node_2_se(node));
+
+ goto again;
+}
+
+static void
+sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if (sched_feat(PARANOID_AVG))
+ return sum_w_vruntime_add_paranoid(cfs_rq, se);
+
+ __sum_w_vruntime_add(cfs_rq, se);
}
static void
sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- unsigned long weight = scale_load_down(se->load.weight);
+ unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
s64 key = entity_key(cfs_rq, se);
cfs_rq->sum_w_vruntime -= key * weight;
@@ -725,7 +790,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
s64 runtime = cfs_rq->sum_w_vruntime;
if (curr) {
- unsigned long w = scale_load_down(curr->load.weight);
+ unsigned long w = avg_vruntime_weight(cfs_rq, curr->load.weight);
runtime += entity_key(cfs_rq, curr) * w;
weight += w;
@@ -735,7 +800,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
if (runtime < 0)
runtime -= (weight - 1);
- delta = div_s64(runtime, weight);
+ delta = div64_long(runtime, weight);
} else if (curr) {
/*
* When there is but one element, it is the average.
@@ -764,17 +829,44 @@ static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq);
*
* -r_max < lag < max(r_max, q)
*/
-static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static s64 entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 avruntime)
{
u64 max_slice = cfs_rq_max_slice(cfs_rq) + TICK_NSEC;
s64 vlag, limit;
+ vlag = avruntime - se->vruntime;
+ limit = calc_delta_fair(max_slice, se);
+
+ return clamp(vlag, -limit, limit);
+}
+
+/*
+ * Delayed dequeue aims to reduce the negative lag of a dequeued task. While
+ * updating the lag of an entity, check that negative lag didn't increase
+ * during the delayed dequeue period which would be unfair.
+ * Similarly, check that the entity didn't gain positive lag when DELAY_ZERO
+ * is set.
+ *
+ * Return true if the lag has been adjusted.
+ */
+static __always_inline
+bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ s64 vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq));
+ bool ret;
+
WARN_ON_ONCE(!se->on_rq);
- vlag = avg_vruntime(cfs_rq) - se->vruntime;
- limit = calc_delta_fair(max_slice, se);
+ if (se->sched_delayed) {
+ /* previous vlag < 0 otherwise se would not be delayed */
+ vlag = max(vlag, se->vlag);
+ if (sched_feat(DELAY_ZERO))
+ vlag = min(vlag, 0);
+ }
+ ret = (vlag == se->vlag);
+ se->vlag = vlag;
- se->vlag = clamp(vlag, -limit, limit);
+ return ret;
}
/*
@@ -801,7 +893,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
long load = cfs_rq->sum_weight;
if (curr && curr->on_rq) {
- unsigned long weight = scale_load_down(curr->load.weight);
+ unsigned long weight = avg_vruntime_weight(cfs_rq, curr->load.weight);
avg += entity_key(cfs_rq, curr) * weight;
load += weight;
@@ -1024,7 +1116,7 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
/*
* Picking the ->next buddy will affect latency but not fairness.
*/
- if (sched_feat(PICK_BUDDY) &&
+ if (sched_feat(PICK_BUDDY) && protect &&
cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
/* ->next will never be delayed */
WARN_ON_ONCE(cfs_rq->next->sched_delayed);
@@ -3841,23 +3933,125 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
se_weight(se) * -se->avg.load_sum);
}
-static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
+static void
+rescale_entity(struct sched_entity *se, unsigned long weight, bool rel_vprot)
+{
+ unsigned long old_weight = se->load.weight;
+
+ /*
+ * VRUNTIME
+ * --------
+ *
+ * COROLLARY #1: The virtual runtime of the entity needs to be
+ * adjusted if re-weight at !0-lag point.
+ *
+ * Proof: For contradiction assume this is not true, so we can
+ * re-weight without changing vruntime at !0-lag point.
+ *
+ * Weight VRuntime Avg-VRuntime
+ * before w v V
+ * after w' v' V'
+ *
+ * Since lag needs to be preserved through re-weight:
+ *
+ * lag = (V - v)*w = (V'- v')*w', where v = v'
+ * ==> V' = (V - v)*w/w' + v (1)
+ *
+ * Let W be the total weight of the entities before reweight,
+ * since V' is the new weighted average of entities:
+ *
+ * V' = (WV + w'v - wv) / (W + w' - w) (2)
+ *
+ * by using (1) & (2) we obtain:
+ *
+ * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
+ * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
+ * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
+ * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
+ *
+ * Since we are doing at !0-lag point which means V != v, we
+ * can simplify (3):
+ *
+ * ==> W / (W + w' - w) = w / w'
+ * ==> Ww' = Ww + ww' - ww
+ * ==> W * (w' - w) = w * (w' - w)
+ * ==> W = w (re-weight indicates w' != w)
+ *
+ * So the cfs_rq contains only one entity, hence vruntime of
+ * the entity @v should always equal to the cfs_rq's weighted
+ * average vruntime @V, which means we will always re-weight
+ * at 0-lag point, thus breach assumption. Proof completed.
+ *
+ *
+ * COROLLARY #2: Re-weight does NOT affect weighted average
+ * vruntime of all the entities.
+ *
+ * Proof: According to corollary #1, Eq. (1) should be:
+ *
+ * (V - v)*w = (V' - v')*w'
+ * ==> v' = V' - (V - v)*w/w' (4)
+ *
+ * According to the weighted average formula, we have:
+ *
+ * V' = (WV - wv + w'v') / (W - w + w')
+ * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
+ * = (WV - wv + w'V' - Vw + wv) / (W - w + w')
+ * = (WV + w'V' - Vw) / (W - w + w')
+ *
+ * ==> V'*(W - w + w') = WV + w'V' - Vw
+ * ==> V' * (W - w) = (W - w) * V (5)
+ *
+ * If the entity is the only one in the cfs_rq, then reweight
+ * always occurs at 0-lag point, so V won't change. Or else
+ * there are other entities, hence W != w, then Eq. (5) turns
+ * into V' = V. So V won't change in either case, proof done.
+ *
+ *
+ * So according to corollary #1 & #2, the effect of re-weight
+ * on vruntime should be:
+ *
+ * v' = V' - (V - v) * w / w' (4)
+ * = V - (V - v) * w / w'
+ * = V - vl * w / w'
+ * = V - vl'
+ */
+ se->vlag = div64_long(se->vlag * old_weight, weight);
+
+ /*
+ * DEADLINE
+ * --------
+ *
+ * When the weight changes, the virtual time slope changes and
+ * we should adjust the relative virtual deadline accordingly.
+ *
+ * d' = v' + (d - v)*w/w'
+ * = V' - (V - v)*w/w' + (d - v)*w/w'
+ * = V - (V - v)*w/w' + (d - v)*w/w'
+ * = V + (d - V)*w/w'
+ */
+ if (se->rel_deadline)
+ se->deadline = div64_long(se->deadline * old_weight, weight);
+
+ if (rel_vprot)
+ se->vprot = div64_long(se->vprot * old_weight, weight);
+}
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
bool curr = cfs_rq->curr == se;
bool rel_vprot = false;
- u64 vprot;
+ u64 avruntime = 0;
if (se->on_rq) {
/* commit outstanding execution time */
update_curr(cfs_rq);
- update_entity_lag(cfs_rq, se);
- se->deadline -= se->vruntime;
+ avruntime = avg_vruntime(cfs_rq);
+ se->vlag = entity_lag(cfs_rq, se, avruntime);
+ se->deadline -= avruntime;
se->rel_deadline = 1;
if (curr && protect_slice(se)) {
- vprot = se->vprot - se->vruntime;
+ se->vprot -= avruntime;
rel_vprot = true;
}
@@ -3868,30 +4062,23 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
}
dequeue_load_avg(cfs_rq, se);
- /*
- * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
- * we need to scale se->vlag when w_i changes.
- */
- se->vlag = div_s64(se->vlag * se->load.weight, weight);
- if (se->rel_deadline)
- se->deadline = div_s64(se->deadline * se->load.weight, weight);
-
- if (rel_vprot)
- vprot = div_s64(vprot * se->load.weight, weight);
+ rescale_entity(se, weight, rel_vprot);
update_load_set(&se->load, weight);
do {
u32 divider = get_pelt_divider(&se->avg);
-
se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
} while (0);
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
- place_entity(cfs_rq, se, 0);
if (rel_vprot)
- se->vprot = se->vruntime + vprot;
+ se->vprot += avruntime;
+ se->deadline += avruntime;
+ se->rel_deadline = 0;
+ se->vruntime = avruntime - se->vlag;
+
update_load_add(&cfs_rq->load, se->load.weight);
if (!curr)
__enqueue_entity(cfs_rq, se);
@@ -5165,6 +5352,7 @@ static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
u64 vslice, vruntime = avg_vruntime(cfs_rq);
+ bool update_zero = false;
s64 lag = 0;
if (!se->custom_slice)
@@ -5181,7 +5369,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
struct sched_entity *curr = cfs_rq->curr;
- unsigned long load;
+ long load, weight;
lag = se->vlag;
@@ -5239,17 +5427,44 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
load = cfs_rq->sum_weight;
if (curr && curr->on_rq)
- load += scale_load_down(curr->load.weight);
+ load += avg_vruntime_weight(cfs_rq, curr->load.weight);
- lag *= load + scale_load_down(se->load.weight);
+ weight = avg_vruntime_weight(cfs_rq, se->load.weight);
+ lag *= load + weight;
if (WARN_ON_ONCE(!load))
load = 1;
- lag = div_s64(lag, load);
+ lag = div64_long(lag, load);
+
+ /*
+ * A heavy entity (relative to the tree) will pull the
+ * avg_vruntime close to its vruntime position on enqueue. But
+ * the zero_vruntime point is only updated at the next
+ * update_deadline()/place_entity()/update_entity_lag().
+ *
+ * Specifically (see the comment near avg_vruntime_weight()):
+ *
+ * sum_w_vruntime = \Sum (v_i - v0) * w_i
+ *
+ * Note that if v0 is near a light entity, both terms will be
+ * small for the light entity, while in that case both terms
+ * are large for the heavy entity, leading to risk of
+ * overflow.
+ *
+ * OTOH if v0 is near the heavy entity, then the difference is
+ * larger for the light entity, but the factor is small, while
+ * for the heavy entity the difference is small but the factor
+ * is large. Avoiding the multiplication overflow.
+ */
+ if (weight > load)
+ update_zero = true;
}
se->vruntime = vruntime - lag;
- if (se->rel_deadline) {
+ if (update_zero)
+ update_zero_vruntime(cfs_rq, -lag);
+
+ if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
se->deadline += se->vruntime;
se->rel_deadline = 0;
return;
@@ -5399,13 +5614,6 @@ static void clear_delayed(struct sched_entity *se)
}
}
-static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
-{
- clear_delayed(se);
- if (sched_feat(DELAY_ZERO) && se->vlag > 0)
- se->vlag = 0;
-}
-
static bool
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -5431,6 +5639,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (sched_feat(DELAY_DEQUEUE) && delay &&
!entity_eligible(cfs_rq, se)) {
update_load_avg(cfs_rq, se, 0);
+ update_entity_lag(cfs_rq, se);
set_delayed(se);
return false;
}
@@ -5470,7 +5679,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_cfs_group(se);
if (flags & DEQUEUE_DELAYED)
- finish_delayed_dequeue_entity(se);
+ clear_delayed(se);
if (cfs_rq->nr_queued == 0) {
update_idle_cfs_rq_clock_pelt(cfs_rq);
@@ -6866,16 +7075,15 @@ static inline void hrtick_update(struct rq *rq)
static inline bool cpu_overutilized(int cpu)
{
- unsigned long rq_util_min, rq_util_max;
+ unsigned long rq_util_max;
if (!sched_energy_enabled())
return false;
- rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
/* Return true only if the utilization doesn't fit CPU's capacity */
- return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
+ return !util_fits_cpu(cpu_util_cfs(cpu), 0, rq_util_max, cpu);
}
/*
@@ -6913,9 +7121,15 @@ static int sched_idle_rq(struct rq *rq)
rq->nr_running);
}
-static int sched_idle_cpu(int cpu)
+static int choose_sched_idle_rq(struct rq *rq, struct task_struct *p)
+{
+ return sched_idle_rq(rq) && !task_has_idle_policy(p);
+}
+
+static int choose_idle_cpu(int cpu, struct task_struct *p)
{
- return sched_idle_rq(cpu_rq(cpu));
+ return available_idle_cpu(cpu) ||
+ choose_sched_idle_rq(cpu_rq(cpu), p);
}
static void
@@ -6931,18 +7145,14 @@ requeue_delayed_entity(struct sched_entity *se)
WARN_ON_ONCE(!se->sched_delayed);
WARN_ON_ONCE(!se->on_rq);
- if (sched_feat(DELAY_ZERO)) {
- update_entity_lag(cfs_rq, se);
- if (se->vlag > 0) {
- cfs_rq->nr_queued--;
- if (se != cfs_rq->curr)
- __dequeue_entity(cfs_rq, se);
- se->vlag = 0;
- place_entity(cfs_rq, se, 0);
- if (se != cfs_rq->curr)
- __enqueue_entity(cfs_rq, se);
- cfs_rq->nr_queued++;
- }
+ if (update_entity_lag(cfs_rq, se)) {
+ cfs_rq->nr_queued--;
+ if (se != cfs_rq->curr)
+ __dequeue_entity(cfs_rq, se);
+ place_entity(cfs_rq, se, 0);
+ if (se != cfs_rq->curr)
+ __enqueue_entity(cfs_rq, se);
+ cfs_rq->nr_queued++;
}
update_load_avg(cfs_rq, se, 0);
@@ -7475,7 +7685,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
if (!sched_core_cookie_match(rq, p))
continue;
- if (sched_idle_cpu(i))
+ if (choose_sched_idle_rq(rq, p))
return i;
if (available_idle_cpu(i)) {
@@ -7566,8 +7776,7 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas
static inline int __select_idle_cpu(int cpu, struct task_struct *p)
{
- if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
- sched_cpu_cookie_match(cpu_rq(cpu), p))
+ if (choose_idle_cpu(cpu, p) && sched_cpu_cookie_match(cpu_rq(cpu), p))
return cpu;
return -1;
@@ -7640,7 +7849,8 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
if (!available_idle_cpu(cpu)) {
idle = false;
if (*idle_cpu == -1) {
- if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
+ if (choose_sched_idle_rq(cpu_rq(cpu), p) &&
+ cpumask_test_cpu(cpu, cpus)) {
*idle_cpu = cpu;
break;
}
@@ -7675,7 +7885,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
*/
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
- if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+ if (choose_idle_cpu(cpu, p))
return cpu;
}
@@ -7714,21 +7924,26 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
- struct sched_domain_shared *sd_share;
-
- cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
if (sched_feat(SIS_UTIL)) {
- sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, target));
- if (sd_share) {
- /* because !--nr is the condition to stop scan */
- nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
- /* overloaded LLC is unlikely to have idle cpu/core */
- if (nr == 1)
- return -1;
- }
+ /*
+ * Increment because !--nr is the condition to stop scan.
+ *
+ * Since "sd" is "sd_llc" for target CPU dereferenced in the
+ * caller, it is safe to directly dereference "sd->shared".
+ * Topology bits always ensure it is assigned for "sd_llc" and it
+ * cannot disappear as long as we have an RCU protected
+ * reference to one of the associated "sd" here.
+ */
+ nr = READ_ONCE(sd->shared->nr_idle_scan) + 1;
+ /* overloaded LLC is unlikely to have idle cpu/core */
+ if (nr == 1)
+ return -1;
}
+ if (!cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr))
+ return -1;
+
if (static_branch_unlikely(&sched_cluster_active)) {
struct sched_group *sg = sd->groups;
@@ -7797,7 +8012,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
for_each_cpu_wrap(cpu, cpus, target) {
unsigned long cpu_cap = capacity_of(cpu);
- if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
+ if (!choose_idle_cpu(cpu, p))
continue;
fits = util_fits_cpu(task_util, util_min, util_max, cpu);
@@ -7868,7 +8083,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
*/
lockdep_assert_irqs_disabled();
- if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
+ if (choose_idle_cpu(target, p) &&
asym_fits_cpu(task_util, util_min, util_max, target))
return target;
@@ -7876,7 +8091,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* If the previous CPU is cache affine and idle, don't be stupid:
*/
if (prev != target && cpus_share_cache(prev, target) &&
- (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
+ choose_idle_cpu(prev, p) &&
asym_fits_cpu(task_util, util_min, util_max, prev)) {
if (!static_branch_unlikely(&sched_cluster_active) ||
@@ -7908,7 +8123,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
- (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
+ choose_idle_cpu(recent_used_cpu, p) &&
cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
@@ -8408,10 +8623,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
struct perf_domain *pd;
struct energy_env eenv;
- rcu_read_lock();
pd = rcu_dereference_all(rd->pd);
if (!pd)
- goto unlock;
+ return target;
/*
* Energy-aware wake-up happens on the lowest sched_domain starting
@@ -8421,13 +8635,13 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
sd = sd->parent;
if (!sd)
- goto unlock;
+ return target;
target = prev_cpu;
sync_entity_load_avg(&p->se);
if (!task_util_est(p) && p_util_min == 0)
- goto unlock;
+ return target;
eenv_task_busy_time(&eenv, p, prev_cpu);
@@ -8522,7 +8736,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
prev_cpu);
/* CPU utilization has changed */
if (prev_delta < base_energy)
- goto unlock;
+ return target;
prev_delta -= base_energy;
prev_actual_cap = cpu_actual_cap;
best_delta = min(best_delta, prev_delta);
@@ -8546,7 +8760,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
max_spare_cap_cpu);
/* CPU utilization has changed */
if (cur_delta < base_energy)
- goto unlock;
+ return target;
cur_delta -= base_energy;
/*
@@ -8563,7 +8777,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
best_actual_cap = cpu_actual_cap;
}
}
- rcu_read_unlock();
if ((best_fits > prev_fits) ||
((best_fits > 0) && (best_delta < prev_delta)) ||
@@ -8571,11 +8784,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
target = best_energy_cpu;
return target;
-
-unlock:
- rcu_read_unlock();
-
- return target;
}
/*
@@ -8620,7 +8828,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
}
- rcu_read_lock();
for_each_domain(cpu, tmp) {
/*
* If both 'cpu' and 'prev_cpu' are part of this domain,
@@ -8646,14 +8853,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
break;
}
- if (unlikely(sd)) {
- /* Slow path */
- new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
- } else if (wake_flags & WF_TTWU) { /* XXX always ? */
- /* Fast path */
- new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
- }
- rcu_read_unlock();
+ /* Slow path */
+ if (unlikely(sd))
+ return sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
+
+ /* Fast path */
+ if (wake_flags & WF_TTWU)
+ return select_idle_sibling(p, prev_cpu, new_cpu);
return new_cpu;
}
@@ -8944,8 +9150,10 @@ pick:
return;
preempt:
- if (preempt_action == PREEMPT_WAKEUP_SHORT)
+ if (preempt_action == PREEMPT_WAKEUP_SHORT) {
cancel_protect_slice(se);
+ clear_buddies(cfs_rq, se);
+ }
resched_curr_lazy(rq);
}
@@ -9793,32 +10001,6 @@ next:
}
/*
- * attach_task() -- attach the task detached by detach_task() to its new rq.
- */
-static void attach_task(struct rq *rq, struct task_struct *p)
-{
- lockdep_assert_rq_held(rq);
-
- WARN_ON_ONCE(task_rq(p) != rq);
- activate_task(rq, p, ENQUEUE_NOCLOCK);
- wakeup_preempt(rq, p, 0);
-}
-
-/*
- * attach_one_task() -- attaches the task returned from detach_one_task() to
- * its new rq.
- */
-static void attach_one_task(struct rq *rq, struct task_struct *p)
-{
- struct rq_flags rf;
-
- rq_lock(rq, &rf);
- update_rq_clock(rq);
- attach_task(rq, p);
- rq_unlock(rq, &rf);
-}
-
-/*
* attach_tasks() -- attaches all tasks detached by detach_tasks() to their
* new rq.
*/
@@ -10055,6 +10237,7 @@ struct sg_lb_stats {
unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
unsigned int group_smt_balance; /* Task on busy SMT be moved */
unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
+ unsigned int group_overutilized; /* At least one CPU is overutilized in the group */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -10287,6 +10470,13 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
static inline bool
group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
{
+ /*
+ * With EAS and uclamp, 1 CPU in the group must be overutilized to
+ * consider the group overloaded.
+ */
+ if (sched_energy_enabled() && !sgs->group_overutilized)
+ return false;
+
if (sgs->sum_nr_running <= sgs->group_weight)
return false;
@@ -10470,14 +10660,12 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
* @group: sched_group whose statistics are to be updated.
* @sgs: variable to hold the statistics for this group.
* @sg_overloaded: sched_group is overloaded
- * @sg_overutilized: sched_group is overutilized
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sd_lb_stats *sds,
struct sched_group *group,
struct sg_lb_stats *sgs,
- bool *sg_overloaded,
- bool *sg_overutilized)
+ bool *sg_overloaded)
{
int i, nr_running, local_group, sd_flags = env->sd->flags;
bool balancing_at_rd = !env->sd->parent;
@@ -10499,7 +10687,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->sum_nr_running += nr_running;
if (cpu_overutilized(i))
- *sg_overutilized = 1;
+ sgs->group_overutilized = 1;
/*
* No need to call idle_cpu() if nr_running is not 0
@@ -11075,6 +11263,7 @@ static void update_idle_cpu_scan(struct lb_env *env,
unsigned long sum_util)
{
struct sched_domain_shared *sd_share;
+ struct sched_domain *sd = env->sd;
int llc_weight, pct;
u64 x, y, tmp;
/*
@@ -11088,11 +11277,7 @@ static void update_idle_cpu_scan(struct lb_env *env,
if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE)
return;
- llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
- if (env->sd->span_weight != llc_weight)
- return;
-
- sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, env->dst_cpu));
+ sd_share = sd->shared;
if (!sd_share)
return;
@@ -11126,10 +11311,11 @@ static void update_idle_cpu_scan(struct lb_env *env,
*/
/* equation [3] */
x = sum_util;
+ llc_weight = sd->span_weight;
do_div(x, llc_weight);
/* equation [4] */
- pct = env->sd->imbalance_pct;
+ pct = sd->imbalance_pct;
tmp = x * x * pct * pct;
do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);
@@ -11170,13 +11356,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
update_group_capacity(env->sd, env->dst_cpu);
}
- update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized);
+ update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded);
if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
sds->busiest_stat = *sgs;
}
+ sg_overutilized |= sgs->group_overutilized;
+
/* Now, start updating sd_lb_stats */
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
@@ -12297,7 +12485,30 @@ static inline void update_newidle_stats(struct sched_domain *sd, unsigned int su
sd->newidle_success += success;
if (sd->newidle_call >= 1024) {
- sd->newidle_ratio = sd->newidle_success;
+ u64 now = sched_clock();
+ s64 delta = now - sd->newidle_stamp;
+ sd->newidle_stamp = now;
+ int ratio = 0;
+
+ if (delta < 0)
+ delta = 0;
+
+ if (sched_feat(NI_RATE)) {
+ /*
+ * ratio delta freq
+ *
+ * 1024 - 4 s - 128 Hz
+ * 512 - 2 s - 256 Hz
+ * 256 - 1 s - 512 Hz
+ * 128 - .5 s - 1024 Hz
+ * 64 - .25 s - 2048 Hz
+ */
+ ratio = delta >> 22;
+ }
+
+ ratio += sd->newidle_success;
+
+ sd->newidle_ratio = min(1024, ratio);
sd->newidle_call /= 2;
sd->newidle_success /= 2;
}
@@ -12344,7 +12555,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
{
int continue_balancing = 1;
int cpu = rq->cpu;
- int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
+ int busy = idle != CPU_IDLE && !sched_idle_rq(rq);
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
@@ -12382,7 +12593,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
* state even if we migrated tasks. Update it.
*/
idle = idle_cpu(cpu);
- busy = !idle && !sched_idle_cpu(cpu);
+ busy = !idle && !sched_idle_rq(rq);
}
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, busy);
@@ -12427,14 +12638,14 @@ static inline int on_null_domain(struct rq *rq)
*/
static inline int find_new_ilb(void)
{
+ int this_cpu = smp_processor_id();
const struct cpumask *hk_mask;
int ilb_cpu;
hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
-
- if (ilb_cpu == smp_processor_id())
+ if (ilb_cpu == this_cpu)
continue;
if (idle_cpu(ilb_cpu))
@@ -13004,7 +13215,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
if (sd->flags & SD_BALANCE_NEWIDLE) {
unsigned int weight = 1;
- if (sched_feat(NI_RANDOM)) {
+ if (sched_feat(NI_RANDOM) && sd->newidle_ratio < 1024) {
/*
* Throw a 1k sided dice; and only run
* newidle_balance according to the success
@@ -14030,7 +14241,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
}
if (ng) {
- gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
+ gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
}
print_numa_stats(m, node, tsf, tpf, gsf, gpf);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index d06228462607..84c4fe3abd74 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -58,6 +58,8 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
SCHED_FEAT(DELAY_DEQUEUE, true)
SCHED_FEAT(DELAY_ZERO, true)
+SCHED_FEAT(PARANOID_AVG, false)
+
/*
* Allow wakeup-time preemption of the current task:
*/
@@ -131,3 +133,4 @@ SCHED_FEAT(LATENCY_WARN, false)
* Do newidle balancing proportional to its success rate using randomization.
*/
SCHED_FEAT(NI_RANDOM, true)
+SCHED_FEAT(NI_RATE, true)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f69e1f16d923..4ee8faf01441 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1302,13 +1302,18 @@ update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
int flags)
{
struct task_struct *p = NULL;
+ struct rq *rq = rq_of_rt_rq(rt_rq);
if (!schedstat_enabled())
return;
- if (rt_entity_is_task(rt_se))
+ if (rt_entity_is_task(rt_se)) {
p = rt_task_of(rt_se);
+ if (p != rq->curr)
+ update_stats_wait_end_rt(rt_rq, rt_se);
+ }
+
if ((flags & DEQUEUE_SLEEP) && p) {
unsigned int state;
@@ -1853,13 +1858,22 @@ static int find_lowest_rq(struct task_struct *task)
static struct task_struct *pick_next_pushable_task(struct rq *rq)
{
- struct task_struct *p;
+ struct plist_head *head = &rq->rt.pushable_tasks;
+ struct task_struct *i, *p = NULL;
if (!has_pushable_tasks(rq))
return NULL;
- p = plist_first_entry(&rq->rt.pushable_tasks,
- struct task_struct, pushable_tasks);
+ plist_for_each_entry(i, head, pushable_tasks) {
+ /* make sure task isn't on_cpu (possible with proxy-exec) */
+ if (!task_on_cpu(rq, i)) {
+ p = i;
+ break;
+ }
+ }
+
+ if (!p)
+ return NULL;
BUG_ON(rq->cpu != task_cpu(p));
BUG_ON(task_current(rq, p));
@@ -2652,7 +2666,7 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
{
struct rt_schedulable_data *d = data;
struct task_group *child;
- unsigned long total, sum = 0;
+ u64 total, sum = 0;
u64 period, runtime;
period = ktime_to_ns(tg->rt_bandwidth.rt_period);
@@ -2676,9 +2690,6 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
return -EBUSY;
- if (WARN_ON(!rt_group_sched_enabled() && tg != &root_task_group))
- return -EBUSY;
-
total = to_ratio(period, runtime);
/*
@@ -2818,19 +2829,6 @@ long sched_group_rt_period(struct task_group *tg)
return rt_period_us;
}
-#ifdef CONFIG_SYSCTL
-static int sched_rt_global_constraints(void)
-{
- int ret = 0;
-
- mutex_lock(&rt_constraints_mutex);
- ret = __rt_schedulable(NULL, 0, 0);
- mutex_unlock(&rt_constraints_mutex);
-
- return ret;
-}
-#endif /* CONFIG_SYSCTL */
-
int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
{
/* Don't accept real-time tasks when there is no way for them to run */
@@ -2840,14 +2838,6 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
return 1;
}
-#else /* !CONFIG_RT_GROUP_SCHED: */
-
-#ifdef CONFIG_SYSCTL
-static int sched_rt_global_constraints(void)
-{
- return 0;
-}
-#endif /* CONFIG_SYSCTL */
#endif /* !CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_SYSCTL
@@ -2859,11 +2849,14 @@ static int sched_rt_global_validate(void)
NSEC_PER_USEC > max_rt_runtime)))
return -EINVAL;
- return 0;
-}
+#ifdef CONFIG_RT_GROUP_SCHED
+ if (!rt_group_sched_enabled())
+ return 0;
-static void sched_rt_do_global(void)
-{
+ scoped_guard(mutex, &rt_constraints_mutex)
+ return __rt_schedulable(NULL, 0, 0);
+#endif
+ return 0;
}
static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
@@ -2889,11 +2882,6 @@ static int sched_rt_handler(const struct ctl_table *table, int write, void *buff
if (ret)
goto undo;
- ret = sched_rt_global_constraints();
- if (ret)
- goto undo;
-
- sched_rt_do_global();
sched_dl_do_global();
}
if (0) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a67c73ecdf79..88e0c93b9e21 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -356,7 +356,7 @@ extern int sched_dl_global_validate(void);
extern void sched_dl_do_global(void);
extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
-extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
+extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
@@ -684,8 +684,9 @@ struct cfs_rq {
s64 sum_w_vruntime;
u64 sum_weight;
-
u64 zero_vruntime;
+ unsigned int sum_shift;
+
#ifdef CONFIG_SCHED_CORE
unsigned int forceidle_seq;
u64 zero_vruntime_fi;
@@ -1611,15 +1612,18 @@ extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
extern bool raw_spin_rq_trylock(struct rq *rq)
__cond_acquires(true, __rq_lockp(rq));
-extern void raw_spin_rq_unlock(struct rq *rq)
- __releases(__rq_lockp(rq));
-
static inline void raw_spin_rq_lock(struct rq *rq)
__acquires(__rq_lockp(rq))
{
raw_spin_rq_lock_nested(rq, 0);
}
+static inline void raw_spin_rq_unlock(struct rq *rq)
+ __releases(__rq_lockp(rq))
+{
+ raw_spin_unlock(rq_lockp(rq));
+}
+
static inline void raw_spin_rq_lock_irq(struct rq *rq)
__acquires(__rq_lockp(rq))
{
@@ -1858,6 +1862,13 @@ static inline void scx_rq_clock_update(struct rq *rq, u64 clock) {}
static inline void scx_rq_clock_invalidate(struct rq *rq) {}
#endif /* !CONFIG_SCHED_CLASS_EXT */
+static inline void assert_balance_callbacks_empty(struct rq *rq)
+{
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_LOCKING) &&
+ rq->balance_callback &&
+ rq->balance_callback != &balance_push_callback);
+}
+
/*
* Lockdep annotation that avoids accidental unlocks; it's like a
* sticky/continuous lockdep_assert_held().
@@ -1874,7 +1885,7 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
rf->clock_update_flags = 0;
- WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback);
+ assert_balance_callbacks_empty(rq);
}
static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
@@ -2854,7 +2865,7 @@ static inline void idle_set_state(struct rq *rq,
static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
- WARN_ON_ONCE(!rcu_read_lock_held());
+ lockdep_assert(rcu_read_lock_any_held());
return rq->idle_state;
}
@@ -2901,7 +2912,7 @@ extern void init_cfs_throttle_work(struct task_struct *p);
#define MAX_BW_BITS (64 - BW_SHIFT)
#define MAX_BW ((1ULL << MAX_BW_BITS) - 1)
-extern unsigned long to_ratio(u64 period, u64 runtime);
+extern u64 to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
extern void post_init_entity_util_avg(struct task_struct *p);
@@ -3006,6 +3017,29 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
+/*
+ * attach_task() -- attach the task detached by detach_task() to its new rq.
+ */
+static inline void attach_task(struct rq *rq, struct task_struct *p)
+{
+ lockdep_assert_rq_held(rq);
+
+ WARN_ON_ONCE(task_rq(p) != rq);
+ activate_task(rq, p, ENQUEUE_NOCLOCK);
+ wakeup_preempt(rq, p, 0);
+}
+
+/*
+ * attach_one_task() -- attaches the task returned from detach_one_task() to
+ * its new rq.
+ */
+static inline void attach_one_task(struct rq *rq, struct task_struct *p)
+{
+ guard(rq_lock)(rq);
+ update_rq_clock(rq);
+ attach_task(rq, p);
+}
+
#ifdef CONFIG_PREEMPT_RT
# define SCHED_NR_MIGRATE_BREAK 8
#else
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index cadb0e9fe19b..b215b0ead9a6 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -911,10 +911,10 @@ err_size:
return -E2BIG;
}
-static void get_params(struct task_struct *p, struct sched_attr *attr)
+static void get_params(struct task_struct *p, struct sched_attr *attr, unsigned int flags)
{
if (task_has_dl_policy(p)) {
- __getparam_dl(p, attr);
+ __getparam_dl(p, attr, flags);
} else if (task_has_rt_policy(p)) {
attr->sched_priority = p->rt_priority;
} else {
@@ -980,7 +980,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
return -ESRCH;
if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
- get_params(p, &attr);
+ get_params(p, &attr, 0);
return sched_setattr(p, &attr);
}
@@ -1065,7 +1065,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
int retval;
if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE ||
- usize < SCHED_ATTR_SIZE_VER0 || flags))
+ usize < SCHED_ATTR_SIZE_VER0))
return -EINVAL;
scoped_guard (rcu) {
@@ -1073,6 +1073,12 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
if (!p)
return -ESRCH;
+ if (flags) {
+ if (!task_has_dl_policy(p) ||
+ flags != SCHED_GETATTR_FLAG_DL_DYNAMIC)
+ return -EINVAL;
+ }
+
retval = security_task_getscheduler(p);
if (retval)
return retval;
@@ -1080,7 +1086,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
kattr.sched_policy = p->policy;
if (p->sched_reset_on_fork)
kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- get_params(p, &kattr);
+ get_params(p, &kattr, flags);
kattr.sched_flags &= SCHED_FLAG_ALL;
#ifdef CONFIG_UCLAMP_TASK
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 32dcddaead82..5847b83d9d55 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -4,6 +4,7 @@
*/
#include <linux/sched/isolation.h>
+#include <linux/sched/clock.h>
#include <linux/bsearch.h>
#include "sched.h"
@@ -272,7 +273,7 @@ void rebuild_sched_domains_energy(void)
static int sched_energy_aware_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- int ret, state;
+ int ret;
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -288,8 +289,7 @@ static int sched_energy_aware_handler(const struct ctl_table *table, int write,
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write) {
- state = static_branch_unlikely(&sched_energy_present);
- if (state != sysctl_sched_energy_aware)
+ if (sysctl_sched_energy_aware != sched_energy_enabled())
rebuild_sched_domains_energy();
}
@@ -387,11 +387,11 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp)
static void sched_energy_set(bool has_eas)
{
- if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
+ if (!has_eas && sched_energy_enabled()) {
if (sched_debug())
pr_info("%s: stopping EAS\n", __func__);
static_branch_disable_cpuslocked(&sched_energy_present);
- } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
+ } else if (has_eas && !sched_energy_enabled()) {
if (sched_debug())
pr_info("%s: starting EAS\n", __func__);
static_branch_enable_cpuslocked(&sched_energy_present);
@@ -684,6 +684,9 @@ static void update_top_cache_domain(int cpu)
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
+
+ /* If sd_llc exists, sd_llc_shared should exist too. */
+ WARN_ON_ONCE(!sd->shared);
sds = sd->shared;
}
@@ -732,6 +735,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
if (sd_parent_degenerate(tmp, parent)) {
tmp->parent = parent->parent;
+ /* Pick reference to parent->shared. */
+ if (parent->shared) {
+ WARN_ON_ONCE(tmp->shared);
+ tmp->shared = parent->shared;
+ parent->shared = NULL;
+ }
+
if (parent->parent) {
parent->parent->child = tmp;
parent->parent->groups->flags = tmp->flags;
@@ -781,6 +791,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
}
struct s_data {
+ struct sched_domain_shared * __percpu *sds;
struct sched_domain * __percpu *sd;
struct root_domain *rd;
};
@@ -788,6 +799,7 @@ struct s_data {
enum s_alloc {
sa_rootdomain,
sa_sd,
+ sa_sd_shared,
sa_sd_storage,
sa_none,
};
@@ -1534,6 +1546,9 @@ static void set_domain_attribute(struct sched_domain *sd,
static void __sdt_free(const struct cpumask *cpu_map);
static int __sdt_alloc(const struct cpumask *cpu_map);
+static void __sds_free(struct s_data *d, const struct cpumask *cpu_map);
+static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map);
+
static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
const struct cpumask *cpu_map)
{
@@ -1545,6 +1560,9 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
case sa_sd:
free_percpu(d->sd);
fallthrough;
+ case sa_sd_shared:
+ __sds_free(d, cpu_map);
+ fallthrough;
case sa_sd_storage:
__sdt_free(cpu_map);
fallthrough;
@@ -1560,9 +1578,11 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
if (__sdt_alloc(cpu_map))
return sa_sd_storage;
+ if (__sds_alloc(d, cpu_map))
+ return sa_sd_shared;
d->sd = alloc_percpu(struct sched_domain *);
if (!d->sd)
- return sa_sd_storage;
+ return sa_sd_shared;
d->rd = alloc_rootdomain();
if (!d->rd)
return sa_sd;
@@ -1575,21 +1595,25 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
* sched_group structure so that the subsequent __free_domain_allocs()
* will not free the data we're using.
*/
-static void claim_allocations(int cpu, struct sched_domain *sd)
+static void claim_allocations(int cpu, struct s_data *d)
{
- struct sd_data *sdd = sd->private;
+ struct sched_domain *sd;
+
+ if (atomic_read(&(*per_cpu_ptr(d->sds, cpu))->ref))
+ *per_cpu_ptr(d->sds, cpu) = NULL;
- WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
- *per_cpu_ptr(sdd->sd, cpu) = NULL;
+ for (sd = *per_cpu_ptr(d->sd, cpu); sd; sd = sd->parent) {
+ struct sd_data *sdd = sd->private;
- if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
- *per_cpu_ptr(sdd->sds, cpu) = NULL;
+ WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
+ *per_cpu_ptr(sdd->sd, cpu) = NULL;
- if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
- *per_cpu_ptr(sdd->sg, cpu) = NULL;
+ if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
+ *per_cpu_ptr(sdd->sg, cpu) = NULL;
- if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
- *per_cpu_ptr(sdd->sgc, cpu) = NULL;
+ if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
+ *per_cpu_ptr(sdd->sgc, cpu) = NULL;
+ }
}
#ifdef CONFIG_NUMA
@@ -1642,14 +1666,19 @@ sd_init(struct sched_domain_topology_level *tl,
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
int sd_id, sd_weight, sd_flags = 0;
struct cpumask *sd_span;
+ u64 now = sched_clock();
- sd_weight = cpumask_weight(tl->mask(tl, cpu));
+ sd_span = sched_domain_span(sd);
+ cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
+ sd_weight = cpumask_weight(sd_span);
+ sd_id = cpumask_first(sd_span);
if (tl->sd_flags)
sd_flags = (*tl->sd_flags)();
if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
- "wrong sd_flags in topology description\n"))
+ "wrong sd_flags in topology description\n"))
sd_flags &= TOPOLOGY_SD_FLAGS;
+ sd_flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
*sd = (struct sched_domain){
.min_interval = sd_weight,
@@ -1679,6 +1708,7 @@ sd_init(struct sched_domain_topology_level *tl,
.newidle_call = 512,
.newidle_success = 256,
.newidle_ratio = 512,
+ .newidle_stamp = now,
.max_newidle_lb_cost = 0,
.last_decay_max_lb_cost = jiffies,
@@ -1686,12 +1716,6 @@ sd_init(struct sched_domain_topology_level *tl,
.name = tl->name,
};
- sd_span = sched_domain_span(sd);
- cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
- sd_id = cpumask_first(sd_span);
-
- sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
-
WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) ==
(SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY),
"CPU capacity asymmetry not supported on SMT\n");
@@ -1727,16 +1751,6 @@ sd_init(struct sched_domain_topology_level *tl,
sd->cache_nice_tries = 1;
}
- /*
- * For all levels sharing cache; connect a sched_domain_shared
- * instance.
- */
- if (sd->flags & SD_SHARE_LLC) {
- sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
- atomic_inc(&sd->shared->ref);
- atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
- }
-
sd->private = sdd;
return sd;
@@ -2372,10 +2386,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sdd->sd)
return -ENOMEM;
- sdd->sds = alloc_percpu(struct sched_domain_shared *);
- if (!sdd->sds)
- return -ENOMEM;
-
sdd->sg = alloc_percpu(struct sched_group *);
if (!sdd->sg)
return -ENOMEM;
@@ -2386,7 +2396,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
for_each_cpu(j, cpu_map) {
struct sched_domain *sd;
- struct sched_domain_shared *sds;
struct sched_group *sg;
struct sched_group_capacity *sgc;
@@ -2397,13 +2406,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
*per_cpu_ptr(sdd->sd, j) = sd;
- sds = kzalloc_node(sizeof(struct sched_domain_shared),
- GFP_KERNEL, cpu_to_node(j));
- if (!sds)
- return -ENOMEM;
-
- *per_cpu_ptr(sdd->sds, j) = sds;
-
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sg)
@@ -2445,8 +2447,6 @@ static void __sdt_free(const struct cpumask *cpu_map)
kfree(*per_cpu_ptr(sdd->sd, j));
}
- if (sdd->sds)
- kfree(*per_cpu_ptr(sdd->sds, j));
if (sdd->sg)
kfree(*per_cpu_ptr(sdd->sg, j));
if (sdd->sgc)
@@ -2454,8 +2454,6 @@ static void __sdt_free(const struct cpumask *cpu_map)
}
free_percpu(sdd->sd);
sdd->sd = NULL;
- free_percpu(sdd->sds);
- sdd->sds = NULL;
free_percpu(sdd->sg);
sdd->sg = NULL;
free_percpu(sdd->sgc);
@@ -2463,6 +2461,42 @@ static void __sdt_free(const struct cpumask *cpu_map)
}
}
+static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map)
+{
+ int j;
+
+ d->sds = alloc_percpu(struct sched_domain_shared *);
+ if (!d->sds)
+ return -ENOMEM;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain_shared *sds;
+
+ sds = kzalloc_node(sizeof(struct sched_domain_shared),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sds)
+ return -ENOMEM;
+
+ *per_cpu_ptr(d->sds, j) = sds;
+ }
+
+ return 0;
+}
+
+static void __sds_free(struct s_data *d, const struct cpumask *cpu_map)
+{
+ int j;
+
+ if (!d->sds)
+ return;
+
+ for_each_cpu(j, cpu_map)
+ kfree(*per_cpu_ptr(d->sds, j));
+
+ free_percpu(d->sds);
+ d->sds = NULL;
+}
+
static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
@@ -2549,6 +2583,74 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
}
/*
+ * Calculate an allowed NUMA imbalance such that LLCs do not get
+ * imbalanced.
+ */
+static void adjust_numa_imbalance(struct sched_domain *sd_llc)
+{
+ struct sched_domain *parent;
+ unsigned int imb_span = 1;
+ unsigned int imb = 0;
+ unsigned int nr_llcs;
+
+ WARN_ON(!(sd_llc->flags & SD_SHARE_LLC));
+ WARN_ON(!sd_llc->parent);
+
+ /*
+ * For a single LLC per node, allow an
+ * imbalance up to 12.5% of the node. This is an
+ * arbitrary cutoff based on two factors -- SMT and
+ * memory channels. For SMT-2, the intent is to
+ * avoid premature sharing of HT resources but
+ * SMT-4 or SMT-8 *may* benefit from a different
+ * cutoff. For memory channels, this is a very
+ * rough estimate of how many channels may be
+ * active and is based on recent CPUs with
+ * many cores.
+ *
+ * For multiple LLCs, allow an imbalance
+ * until multiple tasks would share an LLC
+ * on one node while LLCs on another node
+ * remain idle. This assumes that there are
+ * enough logical CPUs per LLC to avoid SMT
+ * factors and that there is a correlation
+ * between LLCs and memory channels.
+ */
+ nr_llcs = sd_llc->parent->span_weight / sd_llc->span_weight;
+ if (nr_llcs == 1)
+ imb = sd_llc->parent->span_weight >> 3;
+ else
+ imb = nr_llcs;
+
+ imb = max(1U, imb);
+ sd_llc->parent->imb_numa_nr = imb;
+
+ /*
+ * Set span based on the first NUMA domain.
+ *
+ * NUMA systems always add a NODE domain before
+ * iterating the NUMA domains. Since this is before
+ * degeneration, start from sd_llc's parent's
+ * parent which is the lowest an SD_NUMA domain can
+ * be relative to sd_llc.
+ */
+ parent = sd_llc->parent->parent;
+ while (parent && !(parent->flags & SD_NUMA))
+ parent = parent->parent;
+
+ imb_span = parent ? parent->span_weight : sd_llc->parent->span_weight;
+
+ /* Update the upper remainder of the topology */
+ parent = sd_llc->parent;
+ while (parent) {
+ int factor = max(1U, (parent->span_weight / imb_span));
+
+ parent->imb_numa_nr = imb * factor;
+ parent = parent->parent;
+ }
+}
+
+/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
*/
@@ -2605,61 +2707,28 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
}
}
- /*
- * Calculate an allowed NUMA imbalance such that LLCs do not get
- * imbalanced.
- */
for_each_cpu(i, cpu_map) {
- unsigned int imb = 0;
- unsigned int imb_span = 1;
+ sd = *per_cpu_ptr(d.sd, i);
+ if (!sd)
+ continue;
- for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
- struct sched_domain *child = sd->child;
-
- if (!(sd->flags & SD_SHARE_LLC) && child &&
- (child->flags & SD_SHARE_LLC)) {
- struct sched_domain __rcu *top_p;
- unsigned int nr_llcs;
-
- /*
- * For a single LLC per node, allow an
- * imbalance up to 12.5% of the node. This is
- * arbitrary cutoff based two factors -- SMT and
- * memory channels. For SMT-2, the intent is to
- * avoid premature sharing of HT resources but
- * SMT-4 or SMT-8 *may* benefit from a different
- * cutoff. For memory channels, this is a very
- * rough estimate of how many channels may be
- * active and is based on recent CPUs with
- * many cores.
- *
- * For multiple LLCs, allow an imbalance
- * until multiple tasks would share an LLC
- * on one node while LLCs on another node
- * remain idle. This assumes that there are
- * enough logical CPUs per LLC to avoid SMT
- * factors and that there is a correlation
- * between LLCs and memory channels.
- */
- nr_llcs = sd->span_weight / child->span_weight;
- if (nr_llcs == 1)
- imb = sd->span_weight >> 3;
- else
- imb = nr_llcs;
- imb = max(1U, imb);
- sd->imb_numa_nr = imb;
-
- /* Set span based on the first NUMA domain. */
- top_p = sd->parent;
- while (top_p && !(top_p->flags & SD_NUMA)) {
- top_p = top_p->parent;
- }
- imb_span = top_p ? top_p->span_weight : sd->span_weight;
- } else {
- int factor = max(1U, (sd->span_weight / imb_span));
+ /* First, find the topmost SD_SHARE_LLC domain */
+ while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
+ sd = sd->parent;
- sd->imb_numa_nr = imb * factor;
- }
+ if (sd->flags & SD_SHARE_LLC) {
+ int sd_id = cpumask_first(sched_domain_span(sd));
+
+ sd->shared = *per_cpu_ptr(d.sds, sd_id);
+ atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
+ atomic_inc(&sd->shared->ref);
+
+ /*
+ * In the presence of higher domains, adjust the
+ * NUMA imbalance stats for the hierarchy.
+ */
+ if (IS_ENABLED(CONFIG_NUMA) && sd->parent)
+ adjust_numa_imbalance(sd);
}
}
@@ -2668,10 +2737,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (!cpumask_test_cpu(i, cpu_map))
continue;
- for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
- claim_allocations(i, sd);
+ claim_allocations(i, &d);
+
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent)
init_sched_groups_capacity(i, sd);
- }
}
/* Attach the domains */