summaryrefslogtreecommitdiff
path: root/kernel/smp.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-06-03 13:06:42 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2020-06-03 13:06:42 -0700
commitd479c5a1919b4e569dcd3ae9c84ed74a675d0b94 (patch)
tree99d0ad6f712b152991a5846034a184ba857beed7 /kernel/smp.c
parentf6aee505c71bbb035dde146caf5a6abbf3ccbe47 (diff)
parent25de110d148666752dc0e0da7a0b69de31cd7098 (diff)
downloadlwn-d479c5a1919b4e569dcd3ae9c84ed74a675d0b94.tar.gz
lwn-d479c5a1919b4e569dcd3ae9c84ed74a675d0b94.zip
Merge tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: "The changes in this cycle are: - Optimize the task wakeup CPU selection logic, to improve scalability and reduce wakeup latency spikes - PELT enhancements - CFS bandwidth handling fixes - Optimize the wakeup path by remove rq->wake_list and replacing it with ->ttwu_pending - Optimize IPI cross-calls by making flush_smp_call_function_queue() process sync callbacks first. - Misc fixes and enhancements" * tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits) irq_work: Define irq_work_single() on !CONFIG_IRQ_WORK too sched/headers: Split out open-coded prototypes into kernel/sched/smp.h sched: Replace rq::wake_list sched: Add rq::ttwu_pending irq_work, smp: Allow irq_work on call_single_queue smp: Optimize send_call_function_single_ipi() smp: Move irq_work_run() out of flush_smp_call_function_queue() smp: Optimize flush_smp_call_function_queue() sched: Fix smp_call_function_single_async() usage for ILB sched/core: Offload wakee task activation if it the wakee is descheduling sched/core: Optimize ttwu() spinning on p->on_cpu sched: Defend cfs and rt bandwidth quota against overflow sched/cpuacct: Fix charge cpuacct.usage_sys sched/fair: Replace zero-length array with flexible-array sched/pelt: Sync util/runnable_sum with PELT window when propagating sched/cpuacct: Use __this_cpu_add() instead of this_cpu_ptr() sched/fair: Optimize enqueue_task_fair() sched: Make scheduler_ipi inline sched: Clean up scheduler_ipi() sched/core: Simplify sched_init() ...
Diffstat (limited to 'kernel/smp.c')
-rw-r--r--kernel/smp.c175
1 files changed, 132 insertions, 43 deletions
diff --git a/kernel/smp.c b/kernel/smp.c
index 84303197caf9..472c2b274c65 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -22,11 +22,9 @@
#include <linux/hypervisor.h>
#include "smpboot.h"
+#include "sched/smp.h"
-enum {
- CSD_FLAG_LOCK = 0x01,
- CSD_FLAG_SYNCHRONOUS = 0x02,
-};
+#define CSD_TYPE(_csd) ((_csd)->flags & CSD_FLAG_TYPE_MASK)
struct call_function_data {
call_single_data_t __percpu *csd;
@@ -84,6 +82,7 @@ int smpcfd_dying_cpu(unsigned int cpu)
* still pending.
*/
flush_smp_call_function_queue(false);
+ irq_work_run();
return 0;
}
@@ -134,15 +133,33 @@ static __always_inline void csd_unlock(call_single_data_t *csd)
static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
+void __smp_call_single_queue(int cpu, struct llist_node *node)
+{
+ /*
+ * The list addition should be visible before sending the IPI
+ * handler locks the list to pull the entry off it because of
+ * normal cache coherency rules implied by spinlocks.
+ *
+ * If IPIs can go out of order to the cache coherency protocol
+ * in an architecture, sufficient synchronisation should be added
+ * to arch code to make it appear to obey cache coherency WRT
+ * locking and barrier primitives. Generic code isn't really
+ * equipped to do the right thing...
+ */
+ if (llist_add(node, &per_cpu(call_single_queue, cpu)))
+ send_call_function_single_ipi(cpu);
+}
+
/*
* Insert a previously allocated call_single_data_t element
* for execution on the given CPU. data must already have
* ->func, ->info, and ->flags set.
*/
-static int generic_exec_single(int cpu, call_single_data_t *csd,
- smp_call_func_t func, void *info)
+static int generic_exec_single(int cpu, call_single_data_t *csd)
{
if (cpu == smp_processor_id()) {
+ smp_call_func_t func = csd->func;
+ void *info = csd->info;
unsigned long flags;
/*
@@ -156,28 +173,12 @@ static int generic_exec_single(int cpu, call_single_data_t *csd,
return 0;
}
-
if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
csd_unlock(csd);
return -ENXIO;
}
- csd->func = func;
- csd->info = info;
-
- /*
- * The list addition should be visible before sending the IPI
- * handler locks the list to pull the entry off it because of
- * normal cache coherency rules implied by spinlocks.
- *
- * If IPIs can go out of order to the cache coherency protocol
- * in an architecture, sufficient synchronisation should be added
- * to arch code to make it appear to obey cache coherency WRT
- * locking and barrier primitives. Generic code isn't really
- * equipped to do the right thing...
- */
- if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
- arch_send_call_function_single_ipi(cpu);
+ __smp_call_single_queue(cpu, &csd->llist);
return 0;
}
@@ -209,9 +210,9 @@ void generic_smp_call_function_single_interrupt(void)
*/
static void flush_smp_call_function_queue(bool warn_cpu_offline)
{
- struct llist_head *head;
- struct llist_node *entry;
call_single_data_t *csd, *csd_next;
+ struct llist_node *entry, *prev;
+ struct llist_head *head;
static bool warned;
lockdep_assert_irqs_disabled();
@@ -230,32 +231,99 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
* We don't have to use the _safe() variant here
* because we are not invoking the IPI handlers yet.
*/
- llist_for_each_entry(csd, entry, llist)
- pr_warn("IPI callback %pS sent to offline CPU\n",
- csd->func);
+ llist_for_each_entry(csd, entry, llist) {
+ switch (CSD_TYPE(csd)) {
+ case CSD_TYPE_ASYNC:
+ case CSD_TYPE_SYNC:
+ case CSD_TYPE_IRQ_WORK:
+ pr_warn("IPI callback %pS sent to offline CPU\n",
+ csd->func);
+ break;
+
+ case CSD_TYPE_TTWU:
+ pr_warn("IPI task-wakeup sent to offline CPU\n");
+ break;
+
+ default:
+ pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
+ CSD_TYPE(csd));
+ break;
+ }
+ }
}
+ /*
+ * First; run all SYNC callbacks, people are waiting for us.
+ */
+ prev = NULL;
llist_for_each_entry_safe(csd, csd_next, entry, llist) {
- smp_call_func_t func = csd->func;
- void *info = csd->info;
-
/* Do we wait until *after* callback? */
- if (csd->flags & CSD_FLAG_SYNCHRONOUS) {
+ if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
+ smp_call_func_t func = csd->func;
+ void *info = csd->info;
+
+ if (prev) {
+ prev->next = &csd_next->llist;
+ } else {
+ entry = &csd_next->llist;
+ }
+
func(info);
csd_unlock(csd);
} else {
- csd_unlock(csd);
- func(info);
+ prev = &csd->llist;
}
}
+ if (!entry)
+ return;
+
/*
- * Handle irq works queued remotely by irq_work_queue_on().
- * Smp functions above are typically synchronous so they
- * better run first since some other CPUs may be busy waiting
- * for them.
+ * Second; run all !SYNC callbacks.
*/
- irq_work_run();
+ prev = NULL;
+ llist_for_each_entry_safe(csd, csd_next, entry, llist) {
+ int type = CSD_TYPE(csd);
+
+ if (type != CSD_TYPE_TTWU) {
+ if (prev) {
+ prev->next = &csd_next->llist;
+ } else {
+ entry = &csd_next->llist;
+ }
+
+ if (type == CSD_TYPE_ASYNC) {
+ smp_call_func_t func = csd->func;
+ void *info = csd->info;
+
+ csd_unlock(csd);
+ func(info);
+ } else if (type == CSD_TYPE_IRQ_WORK) {
+ irq_work_single(csd);
+ }
+
+ } else {
+ prev = &csd->llist;
+ }
+ }
+
+ /*
+ * Third; only CSD_TYPE_TTWU is left, issue those.
+ */
+ if (entry)
+ sched_ttwu_pending(entry);
+}
+
+void flush_smp_call_function_from_idle(void)
+{
+ unsigned long flags;
+
+ if (llist_empty(this_cpu_ptr(&call_single_queue)))
+ return;
+
+ local_irq_save(flags);
+ flush_smp_call_function_queue(true);
+ local_irq_restore(flags);
}
/*
@@ -271,7 +339,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
{
call_single_data_t *csd;
call_single_data_t csd_stack = {
- .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS,
+ .flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC,
};
int this_cpu;
int err;
@@ -305,7 +373,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
csd_lock(csd);
}
- err = generic_exec_single(cpu, csd, func, info);
+ csd->func = func;
+ csd->info = info;
+
+ err = generic_exec_single(cpu, csd);
if (wait)
csd_lock_wait(csd);
@@ -351,7 +422,7 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd)
csd->flags = CSD_FLAG_LOCK;
smp_wmb();
- err = generic_exec_single(cpu, csd, csd->func, csd->info);
+ err = generic_exec_single(cpu, csd);
out:
preempt_enable();
@@ -466,7 +537,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
csd_lock(csd);
if (wait)
- csd->flags |= CSD_FLAG_SYNCHRONOUS;
+ csd->flags |= CSD_TYPE_SYNC;
csd->func = func;
csd->info = info;
if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
@@ -598,6 +669,24 @@ void __init smp_init(void)
{
int num_nodes, num_cpus;
+ /*
+ * Ensure struct irq_work layout matches so that
+ * flush_smp_call_function_queue() can do horrible things.
+ */
+ BUILD_BUG_ON(offsetof(struct irq_work, llnode) !=
+ offsetof(struct __call_single_data, llist));
+ BUILD_BUG_ON(offsetof(struct irq_work, func) !=
+ offsetof(struct __call_single_data, func));
+ BUILD_BUG_ON(offsetof(struct irq_work, flags) !=
+ offsetof(struct __call_single_data, flags));
+
+ /*
+ * Assert the CSD_TYPE_TTWU layout is similar enough
+ * for task_struct to be on the @call_single_queue.
+ */
+ BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) !=
+ offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist));
+
idle_threads_init();
cpuhp_threads_init();