author     Linus Torvalds <torvalds@linux-foundation.org>   2020-06-03 13:06:42 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>   2020-06-03 13:06:42 -0700
commit     d479c5a1919b4e569dcd3ae9c84ed74a675d0b94
tree       99d0ad6f712b152991a5846034a184ba857beed7  /kernel/smp.c
parent     f6aee505c71bbb035dde146caf5a6abbf3ccbe47
parent     25de110d148666752dc0e0da7a0b69de31cd7098
Merge tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"The changes in this cycle are:
- Optimize the task wakeup CPU selection logic, to improve
scalability and reduce wakeup latency spikes
- PELT enhancements
- CFS bandwidth handling fixes
- Optimize the wakeup path by removing rq->wake_list and replacing it
with ->ttwu_pending
- Optimize IPI cross-calls by making flush_smp_call_function_queue()
process sync callbacks first (a standalone sketch of this two-pass
pattern follows the commit list below)
- Misc fixes and enhancements"
* tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits)
irq_work: Define irq_work_single() on !CONFIG_IRQ_WORK too
sched/headers: Split out open-coded prototypes into kernel/sched/smp.h
sched: Replace rq::wake_list
sched: Add rq::ttwu_pending
irq_work, smp: Allow irq_work on call_single_queue
smp: Optimize send_call_function_single_ipi()
smp: Move irq_work_run() out of flush_smp_call_function_queue()
smp: Optimize flush_smp_call_function_queue()
sched: Fix smp_call_function_single_async() usage for ILB
sched/core: Offload wakee task activation if it the wakee is descheduling
sched/core: Optimize ttwu() spinning on p->on_cpu
sched: Defend cfs and rt bandwidth quota against overflow
sched/cpuacct: Fix charge cpuacct.usage_sys
sched/fair: Replace zero-length array with flexible-array
sched/pelt: Sync util/runnable_sum with PELT window when propagating
sched/cpuacct: Use __this_cpu_add() instead of this_cpu_ptr()
sched/fair: Optimize enqueue_task_fair()
sched: Make scheduler_ipi inline
sched: Clean up scheduler_ipi()
sched/core: Simplify sched_init()
...
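The "process sync callbacks first" item in the summary above boils down to a two-pass walk over a singly linked list: the first pass runs and unlinks the entries that a remote CPU is spinning on, and a later pass handles everything left over. Below is a minimal, standalone userspace sketch of that pattern only; the type and helper names (struct entry, run_sync_entries(), run_remaining()) are made up for illustration, while the kernel's actual implementation uses llist and call_single_data_t and is visible in the kernel/smp.c diff further down.

```c
/*
 * Standalone userspace sketch (not kernel code) of the two-pass idea:
 * run synchronous entries first, unlinking them as we go, then handle
 * everything that is left. Names here are illustrative only.
 */
#include <stdio.h>
#include <stddef.h>

enum entry_type { ENTRY_SYNC, ENTRY_ASYNC };

struct entry {
	struct entry *next;
	enum entry_type type;
	void (*func)(void *info);
	void *info;
};

static void print_info(void *info)
{
	printf("%s\n", (const char *)info);
}

/* Pass 1: run SYNC entries and splice them out of the list. */
static struct entry *run_sync_entries(struct entry *head)
{
	struct entry *entry = head, *prev = NULL, *next;

	while (entry) {
		next = entry->next;
		if (entry->type == ENTRY_SYNC) {
			/* Unlink before running; waiters want us done ASAP. */
			if (prev)
				prev->next = next;
			else
				head = next;
			entry->func(entry->info);
		} else {
			prev = entry;
		}
		entry = next;
	}
	return head;	/* what remains is the !SYNC work */
}

/* Pass 2: whatever remains can run with lower urgency. */
static void run_remaining(struct entry *head)
{
	for (struct entry *entry = head; entry; entry = entry->next)
		entry->func(entry->info);
}

int main(void)
{
	struct entry c = { NULL, ENTRY_SYNC,  print_info, "sync-2"  };
	struct entry b = { &c,   ENTRY_ASYNC, print_info, "async-1" };
	struct entry a = { &b,   ENTRY_SYNC,  print_info, "sync-1"  };

	struct entry *rest = run_sync_entries(&a);	/* sync-1, sync-2 */
	run_remaining(rest);				/* async-1 */
	return 0;
}
```

The unlink-while-walking trick (tracking prev and splicing the current node out) is the same relinking flush_smp_call_function_queue() does with prev->next = &csd_next->llist in the diff below.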
Diffstat (limited to 'kernel/smp.c')
-rw-r--r--  kernel/smp.c  175
1 file changed, 132 insertions(+), 43 deletions(-)
diff --git a/kernel/smp.c b/kernel/smp.c
index 84303197caf9..472c2b274c65 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -22,11 +22,9 @@
 #include <linux/hypervisor.h>
 
 #include "smpboot.h"
+#include "sched/smp.h"
 
-enum {
-	CSD_FLAG_LOCK		= 0x01,
-	CSD_FLAG_SYNCHRONOUS	= 0x02,
-};
+#define CSD_TYPE(_csd)	((_csd)->flags & CSD_FLAG_TYPE_MASK)
 
 struct call_function_data {
 	call_single_data_t	__percpu *csd;
@@ -84,6 +82,7 @@ int smpcfd_dying_cpu(unsigned int cpu)
 	 * still pending.
 	 */
 	flush_smp_call_function_queue(false);
+	irq_work_run();
 	return 0;
 }
 
@@ -134,15 +133,33 @@ static __always_inline void csd_unlock(call_single_data_t *csd)
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
 
+void __smp_call_single_queue(int cpu, struct llist_node *node)
+{
+	/*
+	 * The list addition should be visible before sending the IPI
+	 * handler locks the list to pull the entry off it because of
+	 * normal cache coherency rules implied by spinlocks.
+	 *
+	 * If IPIs can go out of order to the cache coherency protocol
+	 * in an architecture, sufficient synchronisation should be added
+	 * to arch code to make it appear to obey cache coherency WRT
+	 * locking and barrier primitives. Generic code isn't really
+	 * equipped to do the right thing...
+	 */
+	if (llist_add(node, &per_cpu(call_single_queue, cpu)))
+		send_call_function_single_ipi(cpu);
+}
+
 /*
  * Insert a previously allocated call_single_data_t element
  * for execution on the given CPU. data must already have
  * ->func, ->info, and ->flags set.
  */
-static int generic_exec_single(int cpu, call_single_data_t *csd,
-			       smp_call_func_t func, void *info)
+static int generic_exec_single(int cpu, call_single_data_t *csd)
 {
 	if (cpu == smp_processor_id()) {
+		smp_call_func_t func = csd->func;
+		void *info = csd->info;
 		unsigned long flags;
 
 		/*
@@ -156,28 +173,12 @@ static int generic_exec_single(int cpu, call_single_data_t *csd,
 		return 0;
 	}
 
-
 	if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
 		csd_unlock(csd);
 		return -ENXIO;
 	}
 
-	csd->func = func;
-	csd->info = info;
-
-	/*
-	 * The list addition should be visible before sending the IPI
-	 * handler locks the list to pull the entry off it because of
-	 * normal cache coherency rules implied by spinlocks.
-	 *
-	 * If IPIs can go out of order to the cache coherency protocol
-	 * in an architecture, sufficient synchronisation should be added
-	 * to arch code to make it appear to obey cache coherency WRT
-	 * locking and barrier primitives. Generic code isn't really
-	 * equipped to do the right thing...
-	 */
-	if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
-		arch_send_call_function_single_ipi(cpu);
+	__smp_call_single_queue(cpu, &csd->llist);
 
 	return 0;
 }
@@ -209,9 +210,9 @@ void generic_smp_call_function_single_interrupt(void)
  */
 static void flush_smp_call_function_queue(bool warn_cpu_offline)
 {
-	struct llist_head *head;
-	struct llist_node *entry;
 	call_single_data_t *csd, *csd_next;
+	struct llist_node *entry, *prev;
+	struct llist_head *head;
 	static bool warned;
 
 	lockdep_assert_irqs_disabled();
@@ -230,32 +231,99 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
 		 * We don't have to use the _safe() variant here
 		 * because we are not invoking the IPI handlers yet.
 		 */
-		llist_for_each_entry(csd, entry, llist)
-			pr_warn("IPI callback %pS sent to offline CPU\n",
-				csd->func);
+		llist_for_each_entry(csd, entry, llist) {
+			switch (CSD_TYPE(csd)) {
+			case CSD_TYPE_ASYNC:
+			case CSD_TYPE_SYNC:
+			case CSD_TYPE_IRQ_WORK:
+				pr_warn("IPI callback %pS sent to offline CPU\n",
+					csd->func);
+				break;
+
+			case CSD_TYPE_TTWU:
+				pr_warn("IPI task-wakeup sent to offline CPU\n");
+				break;
+
+			default:
+				pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
+					CSD_TYPE(csd));
+				break;
+			}
+		}
 	}
 
+	/*
+	 * First; run all SYNC callbacks, people are waiting for us.
+	 */
+	prev = NULL;
 	llist_for_each_entry_safe(csd, csd_next, entry, llist) {
-		smp_call_func_t func = csd->func;
-		void *info = csd->info;
-
 		/* Do we wait until *after* callback? */
-		if (csd->flags & CSD_FLAG_SYNCHRONOUS) {
+		if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
+			smp_call_func_t func = csd->func;
+			void *info = csd->info;
+
+			if (prev) {
+				prev->next = &csd_next->llist;
+			} else {
+				entry = &csd_next->llist;
+			}
+
 			func(info);
 			csd_unlock(csd);
 		} else {
-			csd_unlock(csd);
-			func(info);
+			prev = &csd->llist;
 		}
 	}
 
+	if (!entry)
+		return;
+
 	/*
-	 * Handle irq works queued remotely by irq_work_queue_on().
-	 * Smp functions above are typically synchronous so they
-	 * better run first since some other CPUs may be busy waiting
-	 * for them.
+	 * Second; run all !SYNC callbacks.
 	 */
-	irq_work_run();
+	prev = NULL;
+	llist_for_each_entry_safe(csd, csd_next, entry, llist) {
+		int type = CSD_TYPE(csd);
+
+		if (type != CSD_TYPE_TTWU) {
+			if (prev) {
+				prev->next = &csd_next->llist;
+			} else {
+				entry = &csd_next->llist;
+			}
+
+			if (type == CSD_TYPE_ASYNC) {
+				smp_call_func_t func = csd->func;
+				void *info = csd->info;
+
+				csd_unlock(csd);
+				func(info);
+			} else if (type == CSD_TYPE_IRQ_WORK) {
+				irq_work_single(csd);
+			}
+
+		} else {
+			prev = &csd->llist;
+		}
+	}
+
+	/*
+	 * Third; only CSD_TYPE_TTWU is left, issue those.
+	 */
+	if (entry)
+		sched_ttwu_pending(entry);
+}
+
+void flush_smp_call_function_from_idle(void)
+{
+	unsigned long flags;
+
+	if (llist_empty(this_cpu_ptr(&call_single_queue)))
+		return;
+
+	local_irq_save(flags);
+	flush_smp_call_function_queue(true);
+	local_irq_restore(flags);
 }
 
 /*
@@ -271,7 +339,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
 {
 	call_single_data_t *csd;
 	call_single_data_t csd_stack = {
-		.flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS,
+		.flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC,
 	};
 	int this_cpu;
 	int err;
@@ -305,7 +373,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
 		csd_lock(csd);
 	}
 
-	err = generic_exec_single(cpu, csd, func, info);
+	csd->func = func;
+	csd->info = info;
+
+	err = generic_exec_single(cpu, csd);
 
 	if (wait)
 		csd_lock_wait(csd);
@@ -351,7 +422,7 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd)
 	csd->flags = CSD_FLAG_LOCK;
 	smp_wmb();
 
-	err = generic_exec_single(cpu, csd, csd->func, csd->info);
+	err = generic_exec_single(cpu, csd);
 out:
 	preempt_enable();
 
@@ -466,7 +537,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
 
 		csd_lock(csd);
 		if (wait)
-			csd->flags |= CSD_FLAG_SYNCHRONOUS;
+			csd->flags |= CSD_TYPE_SYNC;
 		csd->func = func;
 		csd->info = info;
 		if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
@@ -598,6 +669,24 @@ void __init smp_init(void)
 {
 	int num_nodes, num_cpus;
 
+	/*
+	 * Ensure struct irq_work layout matches so that
+	 * flush_smp_call_function_queue() can do horrible things.
+	 */
+	BUILD_BUG_ON(offsetof(struct irq_work, llnode) !=
+		     offsetof(struct __call_single_data, llist));
+	BUILD_BUG_ON(offsetof(struct irq_work, func) !=
+		     offsetof(struct __call_single_data, func));
+	BUILD_BUG_ON(offsetof(struct irq_work, flags) !=
+		     offsetof(struct __call_single_data, flags));
+
+	/*
+	 * Assert the CSD_TYPE_TTWU layout is similar enough
+	 * for task_struct to be on the @call_single_queue.
+	 */
+	BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) !=
+		     offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist));
+
 	idle_threads_init();
 	cpuhp_threads_init();
 
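The BUILD_BUG_ON() checks in that final hunk exist because flush_smp_call_function_queue() now pulls objects of several different types (call_single_data, irq_work, and task_struct wakeup entries) off the same per-CPU llist and tells them apart via the shared flags word; that only works if the list node and the flags word sit at the same offsets in every participating structure. Below is a minimal, standalone sketch of that layout-aliasing technique; the structure and macro names (struct qnode, callback_item, wakeup_item, TYPE_*) are invented for illustration and are not the kernel's.

```c
/*
 * Standalone sketch of the layout-aliasing trick the BUILD_BUG_ON()s
 * enforce: two unrelated structures may share one intrusive queue as
 * long as the link node and the type/flags word sit at the same
 * offsets, so the consumer can inspect the flags word first and then
 * cast to the right type. All names here are made up.
 */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct qnode {			/* intrusive singly-linked list node */
	struct qnode *next;
};

#define TYPE_CALLBACK	0x10
#define TYPE_WAKEUP	0x20
#define TYPE_MASK	0xf0

struct callback_item {		/* plays the role of __call_single_data */
	struct qnode link;
	unsigned int flags;
	void (*func)(void *info);
	void *info;
};

struct wakeup_item {		/* plays the role of the ttwu entry */
	struct qnode link;
	unsigned int flags;
	int target_cpu;
};

/* The shared queue only works if both layouts agree on link and flags. */
static_assert(offsetof(struct callback_item, link) ==
	      offsetof(struct wakeup_item, link), "link offset mismatch");
static_assert(offsetof(struct callback_item, flags) ==
	      offsetof(struct wakeup_item, flags), "flags offset mismatch");

static void drain(struct qnode *node)
{
	while (node) {
		struct qnode *next = node->next;
		/* Peek at the shared flags word to learn the real type. */
		unsigned int flags = ((struct callback_item *)node)->flags;

		if ((flags & TYPE_MASK) == TYPE_CALLBACK) {
			struct callback_item *cb = (struct callback_item *)node;
			cb->func(cb->info);
		} else {
			struct wakeup_item *w = (struct wakeup_item *)node;
			printf("wake task on CPU %d\n", w->target_cpu);
		}
		node = next;
	}
}

static void hello(void *info) { puts((const char *)info); }

int main(void)
{
	struct wakeup_item w = { .flags = TYPE_WAKEUP, .target_cpu = 3 };
	struct callback_item cb = { .flags = TYPE_CALLBACK,
				    .func = hello, .info = "callback ran" };

	cb.link.next = &w.link;		/* queue: cb -> w */
	drain(&cb.link);
	return 0;
}
```

The static_assert() lines play the role the BUILD_BUG_ON() checks play in smp_init(): if a field ever moves, the shared-queue trick fails at compile time rather than at run time.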