summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-05-24 11:11:13 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-05-24 11:11:13 -0700
commit6f3f04c19074972ea12edeed23b07a32894e9e03 (patch)
treeca338b88e521564176ea34f832aa8de261a5437c /kernel
parentcfeb2522c3777923c2bd24772b914195dc93c86b (diff)
parent991d8d8142cad94f9c5c05db25e67fa83d6f772a (diff)
downloadlwn-6f3f04c19074972ea12edeed23b07a32894e9e03.tar.gz
lwn-6f3f04c19074972ea12edeed23b07a32894e9e03.zip
Merge tag 'sched-core-2022-05-23' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: - Updates to scheduler metrics: - PELT fixes & enhancements - PSI fixes & enhancements - Refactor cpu_util_without() - Updates to instrumentation/debugging: - Remove sched_trace_*() helper functions - can be done via debug info - Fix double update_rq_clock() warnings - Introduce & use "preemption model accessors" to simplify some of the Kconfig complexity. - Make softirq handling RT-safe. - Misc smaller fixes & cleanups. * tag 'sched-core-2022-05-23' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: topology: Remove unused cpu_cluster_mask() sched: Reverse sched_class layout sched/deadline: Remove superfluous rq clock update in push_dl_task() sched/core: Avoid obvious double update_rq_clock warning smp: Make softirq handling RT safe in flush_smp_call_function_queue() smp: Rename flush_smp_call_function_from_idle() sched: Fix missing prototype warnings sched/fair: Remove cfs_rq_tg_path() sched/fair: Remove sched_trace_*() helper functions sched/fair: Refactor cpu_util_without() sched/fair: Revise comment about lb decision matrix sched/psi: report zeroes for CPU full at the system level sched/fair: Delete useless condition in tg_unthrottle_up() sched/fair: Fix cfs_rq_clock_pelt() for throttled cfs_rq sched/fair: Move calculate of avg_load to a better location mailmap: Update my email address to @redhat.com MAINTAINERS: Add myself as scheduler topology reviewer psi: Fix trigger being fired unexpectedly at initial ftrace: Use preemption model accessors for trace header printout kcsan: Use preemption model accessors
Diffstat (limited to 'kernel')
-rw-r--r--kernel/kcsan/kcsan_test.c5
-rw-r--r--kernel/sched/build_policy.c2
-rw-r--r--kernel/sched/build_utility.c1
-rw-r--r--kernel/sched/core.c23
-rw-r--r--kernel/sched/deadline.c15
-rw-r--r--kernel/sched/fair.c292
-rw-r--r--kernel/sched/idle.c2
-rw-r--r--kernel/sched/pelt.h4
-rw-r--r--kernel/sched/psi.c18
-rw-r--r--kernel/sched/rt.c5
-rw-r--r--kernel/sched/sched.h55
-rw-r--r--kernel/sched/smp.h6
-rw-r--r--kernel/smp.c32
-rw-r--r--kernel/softirq.c13
-rw-r--r--kernel/stop_machine.c2
-rw-r--r--kernel/trace/trace.c14
16 files changed, 190 insertions, 299 deletions
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index a36fca063a73..767dfacd6ed3 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -1380,13 +1380,14 @@ static const void *nthreads_gen_params(const void *prev, char *desc)
else
nthreads *= 2;
- if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) {
+ if (!preempt_model_preemptible() ||
+ !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) {
/*
* Without any preemption, keep 2 CPUs free for other tasks, one
* of which is the main test case function checking for
* completion or failure.
*/
- const long min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 2 : 0;
+ const long min_unused_cpus = preempt_model_none() ? 2 : 0;
const long min_required_cpus = 2 + min_unused_cpus;
if (num_online_cpus() < min_required_cpus) {
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index e0104b45029a..d9dc9ab3773f 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -15,6 +15,7 @@
/* Headers: */
#include <linux/sched/clock.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/hotplug.h>
#include <linux/sched/posix-timers.h>
#include <linux/sched/rt.h>
@@ -31,6 +32,7 @@
#include <uapi/linux/sched/types.h>
#include "sched.h"
+#include "smp.h"
#include "autogroup.h"
#include "stats.h"
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index eec0849b2aae..99bdd96f454f 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -14,6 +14,7 @@
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>
#include <linux/sched/loadavg.h>
+#include <linux/sched/nohz.h>
#include <linux/sched/mm.h>
#include <linux/sched/rseq_api.h>
#include <linux/sched/task_stack.h>
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2a05096559a2..a247f8d9d417 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -26,7 +26,10 @@
#include <linux/topology.h>
#include <linux/sched/clock.h>
#include <linux/sched/cond_resched.h>
+#include <linux/sched/cputime.h>
#include <linux/sched/debug.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/init.h>
#include <linux/sched/isolation.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/mm.h>
@@ -610,10 +613,10 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
swap(rq1, rq2);
raw_spin_rq_lock(rq1);
- if (__rq_lockp(rq1) == __rq_lockp(rq2))
- return;
+ if (__rq_lockp(rq1) != __rq_lockp(rq2))
+ raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
- raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+ double_rq_clock_clear_update(rq1, rq2);
}
#endif
@@ -2190,7 +2193,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
if (p->sched_class == rq->curr->sched_class)
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
- else if (p->sched_class > rq->curr->sched_class)
+ else if (sched_class_above(p->sched_class, rq->curr->sched_class))
resched_curr(rq);
/*
@@ -2408,7 +2411,7 @@ static int migration_cpu_stop(void *data)
* __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/
- flush_smp_call_function_from_idle();
+ flush_smp_call_function_queue();
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
@@ -5689,7 +5692,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* higher scheduling class, because otherwise those lose the
* opportunity to pull in more work from other CPUs.
*/
- if (likely(prev->sched_class <= &fair_sched_class &&
+ if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = pick_next_task_fair(rq, prev, rf);
@@ -9469,11 +9472,11 @@ void __init sched_init(void)
int i;
/* Make sure the linker didn't screw up */
- BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
- &fair_sched_class + 1 != &rt_sched_class ||
- &rt_sched_class + 1 != &dl_sched_class);
+ BUG_ON(&idle_sched_class != &fair_sched_class + 1 ||
+ &fair_sched_class != &rt_sched_class + 1 ||
+ &rt_sched_class != &dl_sched_class + 1);
#ifdef CONFIG_SMP
- BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
+ BUG_ON(&dl_sched_class != &stop_sched_class + 1);
#endif
wait_bit_init();
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fb4255ae0b2c..936817ae142f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1220,8 +1220,6 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
return (dl_se->runtime <= 0);
}
-extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
-
/*
* This function implements the GRUB accounting rule:
* according to the GRUB reclaiming algorithm, the runtime is
@@ -1832,6 +1830,7 @@ out:
static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
{
+ struct rq_flags rf;
struct rq *rq;
if (READ_ONCE(p->__state) != TASK_WAKING)
@@ -1843,7 +1842,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
* from try_to_wake_up(). Hence, p->pi_lock is locked, but
* rq->lock is not... So, lock it
*/
- raw_spin_rq_lock(rq);
+ rq_lock(rq, &rf);
if (p->dl.dl_non_contending) {
update_rq_clock(rq);
sub_running_bw(&p->dl, &rq->dl);
@@ -1859,7 +1858,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
put_task_struct(p);
}
sub_rq_bw(&p->dl, &rq->dl);
- raw_spin_rq_unlock(rq);
+ rq_unlock(rq, &rf);
}
static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
@@ -2319,13 +2318,7 @@ retry:
deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, later_rq->cpu);
-
- /*
- * Update the later_rq clock here, because the clock is used
- * by the cpufreq_update_util() inside __add_running_bw().
- */
- update_rq_clock(later_rq);
- activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
+ activate_task(later_rq, next_task, 0);
ret = 1;
resched_curr(later_rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a68482d66535..906b2c7c48d1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -36,6 +36,7 @@
#include <linux/sched/cond_resched.h>
#include <linux/sched/cputime.h>
#include <linux/sched/isolation.h>
+#include <linux/sched/nohz.h>
#include <linux/cpuidle.h>
#include <linux/interrupt.h>
@@ -313,19 +314,6 @@ const struct sched_class fair_sched_class;
#define for_each_sched_entity(se) \
for (; se; se = se->parent)
-static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
-{
- if (!path)
- return;
-
- if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
- autogroup_path(cfs_rq->tg, path, len);
- else if (cfs_rq && cfs_rq->tg->css.cgroup)
- cgroup_path(cfs_rq->tg->css.cgroup, path, len);
- else
- strlcpy(path, "(null)", len);
-}
-
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
@@ -493,12 +481,6 @@ static int se_is_idle(struct sched_entity *se)
#define for_each_sched_entity(se) \
for (; se; se = NULL)
-static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
-{
- if (path)
- strlcpy(path, "(null)", len);
-}
-
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
return true;
@@ -4846,11 +4828,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttle_count--;
if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
- cfs_rq->throttled_clock_task;
+ cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+ cfs_rq->throttled_clock_pelt;
/* Add cfs_rq with load or one or more already running entities to the list */
- if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running)
+ if (!cfs_rq_is_decayed(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
}
@@ -4864,7 +4846,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
/* group is entering throttled state, stop time */
if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_task = rq_clock_task(rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
list_del_leaf_cfs_rq(cfs_rq);
}
cfs_rq->throttle_count++;
@@ -5308,7 +5290,7 @@ static void sync_throttle(struct task_group *tg, int cpu)
pcfs_rq = tg->parent->cfs_rq[cpu];
cfs_rq->throttle_count = pcfs_rq->throttle_count;
- cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
}
/* conditionally throttle active cfs_rq's from put_prev_entity() */
@@ -6544,108 +6526,19 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
}
/*
- * cpu_util_without: compute cpu utilization without any contributions from *p
- * @cpu: the CPU which utilization is requested
- * @p: the task which utilization should be discounted
- *
- * The utilization of a CPU is defined by the utilization of tasks currently
- * enqueued on that CPU as well as tasks which are currently sleeping after an
- * execution on that CPU.
- *
- * This method returns the utilization of the specified CPU by discounting the
- * utilization of the specified task, whenever the task is currently
- * contributing to the CPU utilization.
- */
-static unsigned long cpu_util_without(int cpu, struct task_struct *p)
-{
- struct cfs_rq *cfs_rq;
- unsigned int util;
-
- /* Task has no contribution or is new */
- if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
- return cpu_util_cfs(cpu);
-
- cfs_rq = &cpu_rq(cpu)->cfs;
- util = READ_ONCE(cfs_rq->avg.util_avg);
-
- /* Discount task's util from CPU's util */
- lsub_positive(&util, task_util(p));
-
- /*
- * Covered cases:
- *
- * a) if *p is the only task sleeping on this CPU, then:
- * cpu_util (== task_util) > util_est (== 0)
- * and thus we return:
- * cpu_util_without = (cpu_util - task_util) = 0
- *
- * b) if other tasks are SLEEPING on this CPU, which is now exiting
- * IDLE, then:
- * cpu_util >= task_util
- * cpu_util > util_est (== 0)
- * and thus we discount *p's blocked utilization to return:
- * cpu_util_without = (cpu_util - task_util) >= 0
- *
- * c) if other tasks are RUNNABLE on that CPU and
- * util_est > cpu_util
- * then we use util_est since it returns a more restrictive
- * estimation of the spare capacity on that CPU, by just
- * considering the expected utilization of tasks already
- * runnable on that CPU.
- *
- * Cases a) and b) are covered by the above code, while case c) is
- * covered by the following code when estimated utilization is
- * enabled.
- */
- if (sched_feat(UTIL_EST)) {
- unsigned int estimated =
- READ_ONCE(cfs_rq->avg.util_est.enqueued);
-
- /*
- * Despite the following checks we still have a small window
- * for a possible race, when an execl's select_task_rq_fair()
- * races with LB's detach_task():
- *
- * detach_task()
- * p->on_rq = TASK_ON_RQ_MIGRATING;
- * ---------------------------------- A
- * deactivate_task() \
- * dequeue_task() + RaceTime
- * util_est_dequeue() /
- * ---------------------------------- B
- *
- * The additional check on "current == p" it's required to
- * properly fix the execl regression and it helps in further
- * reducing the chances for the above race.
- */
- if (unlikely(task_on_rq_queued(p) || current == p))
- lsub_positive(&estimated, _task_util_est(p));
-
- util = max(util, estimated);
- }
-
- /*
- * Utilization (estimated) can exceed the CPU capacity, thus let's
- * clamp to the maximum CPU capacity to ensure consistency with
- * cpu_util.
- */
- return min_t(unsigned long, util, capacity_orig_of(cpu));
-}
-
-/*
- * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
- * to @dst_cpu.
+ * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu
+ * (@dst_cpu = -1) or migrated to @dst_cpu.
*/
static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
{
struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
- unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
+ unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
/*
- * If @p migrates from @cpu to another, remove its contribution. Or,
- * if @p migrates from another CPU to @cpu, add its contribution. In
- * the other cases, @cpu is not impacted by the migration, so the
- * util_avg should already be correct.
+ * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
+ * contribution. If @p migrates from another CPU to @cpu add its
+ * contribution. In all the other cases @cpu is not impacted by the
+ * migration so its util_avg is already correct.
*/
if (task_cpu(p) == cpu && dst_cpu != cpu)
lsub_positive(&util, task_util(p));
@@ -6653,16 +6546,40 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
util += task_util(p);
if (sched_feat(UTIL_EST)) {
+ unsigned long util_est;
+
util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
/*
- * During wake-up, the task isn't enqueued yet and doesn't
- * appear in the cfs_rq->avg.util_est.enqueued of any rq,
- * so just add it (if needed) to "simulate" what will be
- * cpu_util after the task has been enqueued.
+ * During wake-up @p isn't enqueued yet and doesn't contribute
+ * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
+ * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
+ * has been enqueued.
+ *
+ * During exec (@dst_cpu = -1) @p is enqueued and does
+ * contribute to cpu_rq(cpu)->cfs.util_est.enqueued.
+ * Remove it to "simulate" cpu_util without @p's contribution.
+ *
+ * Despite the task_on_rq_queued(@p) check there is still a
+ * small window for a possible race when an exec
+ * select_task_rq_fair() races with LB's detach_task().
+ *
+ * detach_task()
+ * deactivate_task()
+ * p->on_rq = TASK_ON_RQ_MIGRATING;
+ * -------------------------------- A
+ * dequeue_task() \
+ * dequeue_task_fair() + Race Time
+ * util_est_dequeue() /
+ * -------------------------------- B
+ *
+ * The additional check "current == p" is required to further
+ * reduce the race window.
*/
if (dst_cpu == cpu)
util_est += _task_util_est(p);
+ else if (unlikely(task_on_rq_queued(p) || current == p))
+ lsub_positive(&util_est, _task_util_est(p));
util = max(util, util_est);
}
@@ -6671,6 +6588,28 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
}
/*
+ * cpu_util_without: compute cpu utilization without any contributions from *p
+ * @cpu: the CPU which utilization is requested
+ * @p: the task which utilization should be discounted
+ *
+ * The utilization of a CPU is defined by the utilization of tasks currently
+ * enqueued on that CPU as well as tasks which are currently sleeping after an
+ * execution on that CPU.
+ *
+ * This method returns the utilization of the specified CPU by discounting the
+ * utilization of the specified task, whenever the task is currently
+ * contributing to the CPU utilization.
+ */
+static unsigned long cpu_util_without(int cpu, struct task_struct *p)
+{
+ /* Task has no contribution or is new */
+ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+ return cpu_util_cfs(cpu);
+
+ return cpu_util_next(cpu, p, -1);
+}
+
+/*
* compute_energy(): Estimates the energy that @pd would consume if @p was
* migrated to @dst_cpu. compute_energy() predicts what will be the utilization
* landscape of @pd's CPUs after the task migration, and uses the Energy Model
@@ -9460,8 +9399,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
local->group_capacity;
- sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
- sds->total_capacity;
/*
* If the local group is more loaded than the selected
* busiest group don't try to pull any tasks.
@@ -9470,6 +9407,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
env->imbalance = 0;
return;
}
+
+ sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
+ sds->total_capacity;
}
/*
@@ -9495,7 +9435,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
* has_spare nr_idle balanced N/A N/A balanced balanced
* fully_busy nr_idle nr_idle N/A N/A balanced balanced
- * misfit_task force N/A N/A N/A force force
+ * misfit_task force N/A N/A N/A N/A N/A
* asym_packing force force N/A N/A force force
* imbalanced force force N/A N/A force force
* overloaded force force N/A N/A force avg_load
@@ -11881,101 +11821,3 @@ __init void init_sched_fair_class(void)
#endif /* SMP */
}
-
-/*
- * Helper functions to facilitate extracting info from tracepoints.
- */
-
-const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
-{
-#ifdef CONFIG_SMP
- return cfs_rq ? &cfs_rq->avg : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
-
-char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
-{
- if (!cfs_rq) {
- if (str)
- strlcpy(str, "(null)", len);
- else
- return NULL;
- }
-
- cfs_rq_tg_path(cfs_rq, str, len);
- return str;
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
-
-int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
-{
- return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
-
-const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
-{
-#ifdef CONFIG_SMP
- return rq ? &rq->avg_rt : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
-
-const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
-{
-#ifdef CONFIG_SMP
- return rq ? &rq->avg_dl : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
-
-const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
-{
-#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
- return rq ? &rq->avg_irq : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
-
-int sched_trace_rq_cpu(struct rq *rq)
-{
- return rq ? cpu_of(rq) : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
-
-int sched_trace_rq_cpu_capacity(struct rq *rq)
-{
- return rq ?
-#ifdef CONFIG_SMP
- rq->cpu_capacity
-#else
- SCHED_CAPACITY_SCALE
-#endif
- : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
-
-const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
-{
-#ifdef CONFIG_SMP
- return rd ? rd->span : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rd_span);
-
-int sched_trace_rq_nr_running(struct rq *rq)
-{
- return rq ? rq->nr_running : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f85d7c7caab2..328cccbee444 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -327,7 +327,7 @@ static void do_idle(void)
* RCU relies on this call to be done outside of an RCU read-side
* critical section.
*/
- flush_smp_call_function_from_idle();
+ flush_smp_call_function_queue();
schedule_idle();
if (unlikely(klp_patch_pending(current)))
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index c336f5f481bc..4ff2ed4f8fa1 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -145,9 +145,9 @@ static inline u64 rq_clock_pelt(struct rq *rq)
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
if (unlikely(cfs_rq->throttle_count))
- return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
+ return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
- return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
+ return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
}
#else
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index a4fa3aadfcba..a337f3e35997 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1060,14 +1060,17 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
mutex_unlock(&group->avgs_lock);
for (full = 0; full < 2; full++) {
- unsigned long avg[3];
- u64 total;
+ unsigned long avg[3] = { 0, };
+ u64 total = 0;
int w;
- for (w = 0; w < 3; w++)
- avg[w] = group->avg[res * 2 + full][w];
- total = div_u64(group->total[PSI_AVGS][res * 2 + full],
- NSEC_PER_USEC);
+ /* CPU FULL is undefined at the system level */
+ if (!(group == &psi_system && res == PSI_CPU && full)) {
+ for (w = 0; w < 3; w++)
+ avg[w] = group->avg[res * 2 + full][w];
+ total = div_u64(group->total[PSI_AVGS][res * 2 + full],
+ NSEC_PER_USEC);
+ }
seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
full ? "full" : "some",
@@ -1117,7 +1120,8 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
t->state = state;
t->threshold = threshold_us * NSEC_PER_USEC;
t->win.size = window_us * NSEC_PER_USEC;
- window_reset(&t->win, 0, 0, 0);
+ window_reset(&t->win, sched_clock(),
+ group->total[PSI_POLL][t->state], 0);
t->event = 0;
t->last_event_time = 0;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a32c46889af8..7891c0f0e1ff 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -871,6 +871,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
struct rq *rq = rq_of_rt_rq(rt_rq);
+ struct rq_flags rf;
int skip;
/*
@@ -885,7 +886,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (skip)
continue;
- raw_spin_rq_lock(rq);
+ rq_lock(rq, &rf);
update_rq_clock(rq);
if (rt_rq->rt_time) {
@@ -923,7 +924,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (enqueue)
sched_rt_rq_enqueue(rt_rq);
- raw_spin_rq_unlock(rq);
+ rq_unlock(rq, &rf);
}
if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8dccb34eb190..2ce18584dca3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -603,8 +603,8 @@ struct cfs_rq {
s64 runtime_remaining;
u64 throttled_clock;
- u64 throttled_clock_task;
- u64 throttled_clock_task_time;
+ u64 throttled_clock_pelt;
+ u64 throttled_clock_pelt_time;
int throttled;
int throttle_count;
struct list_head throttled_list;
@@ -1827,12 +1827,7 @@ static inline void dirty_sched_domain_sysctl(int cpu)
#endif
extern int sched_update_scaling(void);
-
-extern void flush_smp_call_function_from_idle(void);
-
-#else /* !CONFIG_SMP: */
-static inline void flush_smp_call_function_from_idle(void) { }
-#endif
+#endif /* CONFIG_SMP */
#include "stats.h"
@@ -2182,6 +2177,8 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next)
*
* include/asm-generic/vmlinux.lds.h
*
+ * *CAREFUL* they are laid out in *REVERSE* order!!!
+ *
* Also enforce alignment on the instance, not the type, to guarantee layout.
*/
#define DEFINE_SCHED_CLASS(name) \
@@ -2190,17 +2187,16 @@ const struct sched_class name##_sched_class \
__section("__" #name "_sched_class")
/* Defined in include/asm-generic/vmlinux.lds.h */
-extern struct sched_class __begin_sched_classes[];
-extern struct sched_class __end_sched_classes[];
-
-#define sched_class_highest (__end_sched_classes - 1)
-#define sched_class_lowest (__begin_sched_classes - 1)
+extern struct sched_class __sched_class_highest[];
+extern struct sched_class __sched_class_lowest[];
#define for_class_range(class, _from, _to) \
- for (class = (_from); class != (_to); class--)
+ for (class = (_from); class < (_to); class++)
#define for_each_class(class) \
- for_class_range(class, sched_class_highest, sched_class_lowest)
+ for_class_range(class, __sched_class_highest, __sched_class_lowest)
+
+#define sched_class_above(_a, _b) ((_a) < (_b))
extern const struct sched_class stop_sched_class;
extern const struct sched_class dl_sched_class;
@@ -2309,6 +2305,7 @@ extern void resched_cpu(int cpu);
extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
+extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
@@ -2478,6 +2475,24 @@ unsigned long arch_scale_freq_capacity(int cpu)
}
#endif
+#ifdef CONFIG_SCHED_DEBUG
+/*
+ * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to
+ * acquire rq lock instead of rq_lock(). So at the end of these two functions
+ * we need to call double_rq_clock_clear_update() to clear RQCF_UPDATED of
+ * rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning.
+ */
+static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
+{
+ rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+ /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */
+#ifdef CONFIG_SMP
+ rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+#endif
+}
+#else
+static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {}
+#endif
#ifdef CONFIG_SMP
@@ -2543,14 +2558,15 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
- if (__rq_lockp(this_rq) == __rq_lockp(busiest))
- return 0;
-
- if (likely(raw_spin_rq_trylock(busiest)))
+ if (__rq_lockp(this_rq) == __rq_lockp(busiest) ||
+ likely(raw_spin_rq_trylock(busiest))) {
+ double_rq_clock_clear_update(this_rq, busiest);
return 0;
+ }
if (rq_order_less(this_rq, busiest)) {
raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING);
+ double_rq_clock_clear_update(this_rq, busiest);
return 0;
}
@@ -2644,6 +2660,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
BUG_ON(rq1 != rq2);
raw_spin_rq_lock(rq1);
__acquire(rq2->lock); /* Fake it out ;) */
+ double_rq_clock_clear_update(rq1, rq2);
}
/*
diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h
index 9620e323162c..2eb23dd0f285 100644
--- a/kernel/sched/smp.h
+++ b/kernel/sched/smp.h
@@ -7,3 +7,9 @@
extern void sched_ttwu_pending(void *arg);
extern void send_call_function_single_ipi(int cpu);
+
+#ifdef CONFIG_SMP
+extern void flush_smp_call_function_queue(void);
+#else
+static inline void flush_smp_call_function_queue(void) { }
+#endif
diff --git a/kernel/smp.c b/kernel/smp.c
index df9393aeae28..dd215f439426 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -96,7 +96,7 @@ static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
-static void flush_smp_call_function_queue(bool warn_cpu_offline);
+static void __flush_smp_call_function_queue(bool warn_cpu_offline);
int smpcfd_prepare_cpu(unsigned int cpu)
{
@@ -141,7 +141,7 @@ int smpcfd_dying_cpu(unsigned int cpu)
* ensure that the outgoing CPU doesn't go offline with work
* still pending.
*/
- flush_smp_call_function_queue(false);
+ __flush_smp_call_function_queue(false);
irq_work_run();
return 0;
}
@@ -544,11 +544,11 @@ void generic_smp_call_function_single_interrupt(void)
{
cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->gotipi, CFD_SEQ_NOCPU,
smp_processor_id(), CFD_SEQ_GOTIPI);
- flush_smp_call_function_queue(true);
+ __flush_smp_call_function_queue(true);
}
/**
- * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
+ * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks
*
* @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
* offline CPU. Skip this check if set to 'false'.
@@ -561,7 +561,7 @@ void generic_smp_call_function_single_interrupt(void)
* Loop through the call_single_queue and run all the queued callbacks.
* Must be called with interrupts disabled.
*/
-static void flush_smp_call_function_queue(bool warn_cpu_offline)
+static void __flush_smp_call_function_queue(bool warn_cpu_offline)
{
call_single_data_t *csd, *csd_next;
struct llist_node *entry, *prev;
@@ -684,8 +684,22 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
smp_processor_id(), CFD_SEQ_HDLEND);
}
-void flush_smp_call_function_from_idle(void)
+
+/**
+ * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
+ * from task context (idle, migration thread)
+ *
+ * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it
+ * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by
+ * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to
+ * handle queued SMP function calls before scheduling.
+ *
+ * The migration thread has to ensure that an eventually pending wakeup has
+ * been handled before it migrates a task.
+ */
+void flush_smp_call_function_queue(void)
{
+ unsigned int was_pending;
unsigned long flags;
if (llist_empty(this_cpu_ptr(&call_single_queue)))
@@ -694,9 +708,11 @@ void flush_smp_call_function_from_idle(void)
cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU,
smp_processor_id(), CFD_SEQ_IDLE);
local_irq_save(flags);
- flush_smp_call_function_queue(true);
+ /* Get the already pending soft interrupts for RT enabled kernels */
+ was_pending = local_softirq_pending();
+ __flush_smp_call_function_queue(true);
if (local_softirq_pending())
- do_softirq();
+ do_softirq_post_smp_call_flush(was_pending);
local_irq_restore(flags);
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index fac801815554..9f0aef8aa9ff 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -294,6 +294,19 @@ static inline void invoke_softirq(void)
wakeup_softirqd();
}
+/*
+ * flush_smp_call_function_queue() can raise a soft interrupt in a function
+ * call. On RT kernels this is undesired and the only known functionality
+ * in the block layer which does this is disabled on RT. If soft interrupts
+ * get raised which haven't been raised before the flush, warn so it can be
+ * investigated.
+ */
+void do_softirq_post_smp_call_flush(unsigned int was_pending)
+{
+ if (WARN_ON_ONCE(was_pending != local_softirq_pending()))
+ invoke_softirq();
+}
+
#else /* CONFIG_PREEMPT_RT */
/*
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 0887b3af609e..cedb17ba158a 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -535,8 +535,6 @@ void stop_machine_park(int cpu)
kthread_park(stopper->thread);
}
-extern void sched_set_stop_task(int cpu, struct task_struct *stop);
-
static void cpu_stop_create(unsigned int cpu)
{
sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu));
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f4de111fa18f..124f1897fd56 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4289,17 +4289,11 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
entries,
total,
buf->cpu,
-#if defined(CONFIG_PREEMPT_NONE)
- "server",
-#elif defined(CONFIG_PREEMPT_VOLUNTARY)
- "desktop",
-#elif defined(CONFIG_PREEMPT)
- "preempt",
-#elif defined(CONFIG_PREEMPT_RT)
- "preempt_rt",
-#else
+ preempt_model_none() ? "server" :
+ preempt_model_voluntary() ? "desktop" :
+ preempt_model_full() ? "preempt" :
+ preempt_model_rt() ? "preempt_rt" :
"unknown",
-#endif
/* These are reserved for later use */
0, 0, 0, 0);
#ifdef CONFIG_SMP