diff options
-rw-r--r-- | Documentation/scheduler/sched-deadline.txt | 18 | ||||
-rw-r--r-- | include/linux/kernel.h | 9 | ||||
-rw-r--r-- | include/linux/sched.h | 18 | ||||
-rw-r--r-- | kernel/exit.c | 26 | ||||
-rw-r--r-- | kernel/sched/core.c | 189 | ||||
-rw-r--r-- | kernel/sched/cpudeadline.c | 153 | ||||
-rw-r--r-- | kernel/sched/cpudeadline.h | 3 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 33 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 78 | ||||
-rw-r--r-- | kernel/sched/debug.c | 103 | ||||
-rw-r--r-- | kernel/sched/fair.c | 416 | ||||
-rw-r--r-- | kernel/sched/idle_task.c | 2 | ||||
-rw-r--r-- | kernel/sched/sched.h | 3 | ||||
-rw-r--r-- | kernel/sched/stats.h | 24 | ||||
-rw-r--r-- | kernel/smpboot.c | 2 | ||||
-rw-r--r-- | kernel/stop_machine.c | 5 | ||||
-rw-r--r-- | mm/huge_memory.c | 2 | ||||
-rw-r--r-- | mm/memory.c | 2 | ||||
-rw-r--r-- | tools/objtool/builtin-check.c | 1 |
19 files changed, 633 insertions, 454 deletions
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt index 53a2fe1ae8b8..8e37b0ba2c9d 100644 --- a/Documentation/scheduler/sched-deadline.txt +++ b/Documentation/scheduler/sched-deadline.txt @@ -16,6 +16,7 @@ CONTENTS 4.1 System-wide settings 4.2 Task interface 4.3 Default behavior + 4.4 Behavior of sched_yield() 5. Tasks CPU affinity 5.1 SCHED_DEADLINE and cpusets HOWTO 6. Future plans @@ -426,6 +427,23 @@ CONTENTS Finally, notice that in order not to jeopardize the admission control a -deadline task cannot fork. + +4.4 Behavior of sched_yield() +----------------------------- + + When a SCHED_DEADLINE task calls sched_yield(), it gives up its + remaining runtime and is immediately throttled, until the next + period, when its runtime will be replenished (a special flag + dl_yielded is set and used to handle correctly throttling and runtime + replenishment after a call to sched_yield()). + + This behavior of sched_yield() allows the task to wake-up exactly at + the beginning of the next period. Also, this may be useful in the + future with bandwidth reclaiming mechanisms, where sched_yield() will + make the leftover runtime available for reclamation by other + SCHED_DEADLINE tasks. + + 5. Tasks CPU affinity ===================== diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d96a6118d26a..74fd6f05bc5b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -259,17 +259,14 @@ static inline void might_fault(void) { } extern struct atomic_notifier_head panic_notifier_list; extern long (*panic_blink)(int state); __printf(1, 2) -void panic(const char *fmt, ...) - __noreturn __cold; +void panic(const char *fmt, ...) 
__noreturn __cold; void nmi_panic(struct pt_regs *regs, const char *msg); extern void oops_enter(void); extern void oops_exit(void); void print_oops_end_marker(void); extern int oops_may_print(void); -void do_exit(long error_code) - __noreturn; -void complete_and_exit(struct completion *, long) - __noreturn; +void do_exit(long error_code) __noreturn; +void complete_and_exit(struct completion *, long) __noreturn; /* Internal, do not use. */ int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res); diff --git a/include/linux/sched.h b/include/linux/sched.h index 62c68e513e39..b99fcd1b341e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -448,6 +448,8 @@ static inline void io_schedule(void) io_schedule_timeout(MAX_SCHEDULE_TIMEOUT); } +void __noreturn do_task_dead(void); + struct nsproxy; struct user_namespace; @@ -1022,7 +1024,8 @@ extern void wake_up_q(struct wake_q_head *head); #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ -#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu power */ +#define SD_ASYM_CPUCAPACITY 0x0040 /* Groups have different max cpu capacities */ +#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu capacity */ #define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ @@ -3206,7 +3209,11 @@ static inline int signal_pending_state(long state, struct task_struct *p) * cond_resched_lock() will drop the spinlock before scheduling, * cond_resched_softirq() will enable bhs before scheduling. 
*/ +#ifndef CONFIG_PREEMPT extern int _cond_resched(void); +#else +static inline int _cond_resched(void) { return 0; } +#endif #define cond_resched() ({ \ ___might_sleep(__FILE__, __LINE__, 0); \ @@ -3236,6 +3243,15 @@ static inline void cond_resched_rcu(void) #endif } +static inline unsigned long get_preempt_disable_ip(struct task_struct *p) +{ +#ifdef CONFIG_DEBUG_PREEMPT + return p->preempt_disable_ip; +#else + return 0; +#endif +} + /* * Does a critical section need to be broken due to another * task waiting?: (technically does not depend on CONFIG_PREEMPT, diff --git a/kernel/exit.c b/kernel/exit.c index 091a78be3b09..1e1d913914c0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -725,7 +725,7 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -void do_exit(long code) +void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; @@ -882,29 +882,7 @@ void do_exit(long code) exit_rcu(); TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); - /* - * The setting of TASK_RUNNING by try_to_wake_up() may be delayed - * when the following two conditions become true. - * - There is race condition of mmap_sem (It is acquired by - * exit_mm()), and - * - SMI occurs before setting TASK_RUNINNG. - * (or hypervisor of virtual machine switches to other guest) - * As a result, we may become TASK_RUNNING after becoming TASK_DEAD - * - * To avoid it, we have to wait for releasing tsk->pi_lock which - * is held by try_to_wake_up() - */ - smp_mb(); - raw_spin_unlock_wait(&tsk->pi_lock); - - /* causes final put_task_struct in finish_task_switch(). */ - tsk->state = TASK_DEAD; - tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ - schedule(); - BUG(); - /* Avoid "noreturn function does return". 
*/ - for (;;) - cpu_relax(); /* For when BUG is null */ + do_task_dead(); } EXPORT_SYMBOL_GPL(do_exit); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 44817c640e99..8bae0cd09e9e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1063,8 +1063,12 @@ static int migration_cpu_stop(void *data) * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because * we're holding p->pi_lock. */ - if (task_rq(p) == rq && task_on_rq_queued(p)) - rq = __migrate_task(rq, p, arg->dest_cpu); + if (task_rq(p) == rq) { + if (task_on_rq_queued(p)) + rq = __migrate_task(rq, p, arg->dest_cpu); + else + p->wake_cpu = arg->dest_cpu; + } raw_spin_unlock(&rq->lock); raw_spin_unlock(&p->pi_lock); @@ -1265,7 +1269,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) /* * Task isn't running anymore; make it appear like we migrated * it before it went to sleep. This means on wakeup we make the - * previous cpu our targer instead of where it really is. + * previous cpu our target instead of where it really is. 
*/ p->wake_cpu = cpu; } @@ -1629,23 +1633,25 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { -#ifdef CONFIG_SCHEDSTATS - struct rq *rq = this_rq(); + struct rq *rq; -#ifdef CONFIG_SMP - int this_cpu = smp_processor_id(); + if (!schedstat_enabled()) + return; - if (cpu == this_cpu) { - schedstat_inc(rq, ttwu_local); - schedstat_inc(p, se.statistics.nr_wakeups_local); + rq = this_rq(); + +#ifdef CONFIG_SMP + if (cpu == rq->cpu) { + schedstat_inc(rq->ttwu_local); + schedstat_inc(p->se.statistics.nr_wakeups_local); } else { struct sched_domain *sd; - schedstat_inc(p, se.statistics.nr_wakeups_remote); + schedstat_inc(p->se.statistics.nr_wakeups_remote); rcu_read_lock(); - for_each_domain(this_cpu, sd) { + for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - schedstat_inc(sd, ttwu_wake_remote); + schedstat_inc(sd->ttwu_wake_remote); break; } } @@ -1653,17 +1659,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } if (wake_flags & WF_MIGRATED) - schedstat_inc(p, se.statistics.nr_wakeups_migrate); - + schedstat_inc(p->se.statistics.nr_wakeups_migrate); #endif /* CONFIG_SMP */ - schedstat_inc(rq, ttwu_count); - schedstat_inc(p, se.statistics.nr_wakeups); + schedstat_inc(rq->ttwu_count); + schedstat_inc(p->se.statistics.nr_wakeups); if (wake_flags & WF_SYNC) - schedstat_inc(p, se.statistics.nr_wakeups_sync); - -#endif /* CONFIG_SCHEDSTATS */ + schedstat_inc(p->se.statistics.nr_wakeups_sync); } static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) @@ -2084,8 +2087,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ttwu_queue(p, cpu, wake_flags); stat: - if (schedstat_enabled()) - ttwu_stat(p, cpu, wake_flags); + ttwu_stat(p, cpu, wake_flags); out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -2095,6 +2097,7 @@ out: /** * try_to_wake_up_local - try to wake up a local task with rq 
lock held * @p: the thread to be awakened + * @cookie: context's cookie for pinning * * Put @p on the run-queue if it's not already there. The caller must * ensure that this_rq() is locked, @p is bound to this_rq() and not @@ -2133,8 +2136,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_do_wakeup(rq, p, 0, cookie); - if (schedstat_enabled()) - ttwu_stat(p, smp_processor_id(), 0); + ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); } @@ -3192,6 +3194,9 @@ static inline void preempt_latency_stop(int val) { } */ static noinline void __schedule_bug(struct task_struct *prev) { + /* Save this before calling printk(), since that will clobber it */ + unsigned long preempt_disable_ip = get_preempt_disable_ip(current); + if (oops_in_progress) return; @@ -3202,13 +3207,12 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); -#ifdef CONFIG_DEBUG_PREEMPT - if (in_atomic_preempt_off()) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) + && in_atomic_preempt_off()) { pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); + print_ip_sym(preempt_disable_ip); pr_cont("\n"); } -#endif if (panic_on_warn) panic("scheduling while atomic\n"); @@ -3234,7 +3238,7 @@ static inline void schedule_debug(struct task_struct *prev) profile_hit(SCHED_PROFILING, __builtin_return_address(0)); - schedstat_inc(this_rq(), sched_count); + schedstat_inc(this_rq()->sched_count); } /* @@ -3327,17 +3331,6 @@ static void __sched notrace __schedule(bool preempt) rq = cpu_rq(cpu); prev = rq->curr; - /* - * do_exit() calls schedule() with preemption disabled as an exception; - * however we must fix that up, otherwise the next task will see an - * inconsistent (higher) preempt count. - * - * It also avoids the below schedule_debug() test from complaining - * about this. 
- */ - if (unlikely(prev->state == TASK_DEAD)) - preempt_enable_no_resched_notrace(); - schedule_debug(prev); if (sched_feat(HRTICK)) @@ -3405,6 +3398,33 @@ static void __sched notrace __schedule(bool preempt) } STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ +void __noreturn do_task_dead(void) +{ + /* + * The setting of TASK_RUNNING by try_to_wake_up() may be delayed + * when the following two conditions become true. + * - There is race condition of mmap_sem (It is acquired by + * exit_mm()), and + * - SMI occurs before setting TASK_RUNINNG. + * (or hypervisor of virtual machine switches to other guest) + * As a result, we may become TASK_RUNNING after becoming TASK_DEAD + * + * To avoid it, we have to wait for releasing tsk->pi_lock which + * is held by try_to_wake_up() + */ + smp_mb(); + raw_spin_unlock_wait(&current->pi_lock); + + /* causes final put_task_struct in finish_task_switch(). */ + __set_current_state(TASK_DEAD); + current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ + __schedule(false); + BUG(); + /* Avoid "noreturn function does return". 
*/ + for (;;) + cpu_relax(); /* For when BUG is null */ +} + static inline void sched_submit_work(struct task_struct *tsk) { if (!tsk->state || tsk_is_pi_blocked(tsk)) @@ -4846,7 +4866,7 @@ SYSCALL_DEFINE0(sched_yield) { struct rq *rq = this_rq_lock(); - schedstat_inc(rq, yld_count); + schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); /* @@ -4863,6 +4883,7 @@ SYSCALL_DEFINE0(sched_yield) return 0; } +#ifndef CONFIG_PREEMPT int __sched _cond_resched(void) { if (should_resched(0)) { @@ -4872,6 +4893,7 @@ int __sched _cond_resched(void) return 0; } EXPORT_SYMBOL(_cond_resched); +#endif /* * __cond_resched_lock() - if a reschedule is pending, drop the given lock, @@ -4997,7 +5019,7 @@ again: yielded = curr->sched_class->yield_to_task(rq, p, preempt); if (yielded) { - schedstat_inc(rq, yld_count); + schedstat_inc(rq->yld_count); /* * Make p's CPU reschedule; pick_next_entity takes care of * fairness. @@ -5717,6 +5739,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) } } #else /* !CONFIG_SCHED_DEBUG */ + +# define sched_debug_enabled 0 # define sched_domain_debug(sd, cpu) do { } while (0) static inline bool sched_debug(void) { @@ -5735,6 +5759,7 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | + SD_ASYM_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN)) { if (sd->groups != sd->groups->next) @@ -5765,6 +5790,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | + SD_ASYM_CPUCAPACITY | SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING | @@ -6374,23 +6400,32 @@ static int sched_domains_curr_level; /* * SD_flags allowed in topology descriptions. 
* - * SD_SHARE_CPUCAPACITY - describes SMT topologies - * SD_SHARE_PKG_RESOURCES - describes shared caches - * SD_NUMA - describes NUMA topologies - * SD_SHARE_POWERDOMAIN - describes shared power domain + * These flags are purely descriptive of the topology and do not prescribe + * behaviour. Behaviour is artificial and mapped in the below sd_init() + * function: * - * Odd one out: - * SD_ASYM_PACKING - describes SMT quirks + * SD_SHARE_CPUCAPACITY - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA - describes NUMA topologies + * SD_SHARE_POWERDOMAIN - describes shared power domain + * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies + * + * Odd one out, which beside describing the topology has a quirk also + * prescribes the desired behaviour that goes along with it: + * + * SD_ASYM_PACKING - describes SMT quirks */ #define TOPOLOGY_SD_FLAGS \ (SD_SHARE_CPUCAPACITY | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ + SD_ASYM_CPUCAPACITY | \ SD_SHARE_POWERDOMAIN) static struct sched_domain * -sd_init(struct sched_domain_topology_level *tl, int cpu) +sd_init(struct sched_domain_topology_level *tl, + struct sched_domain *child, int cpu) { struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); int sd_weight, sd_flags = 0; @@ -6442,6 +6477,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) .smt_gain = 0, .max_newidle_lb_cost = 0, .next_decay_max_lb_cost = jiffies, + .child = child, #ifdef CONFIG_SCHED_DEBUG .name = tl->name, #endif @@ -6451,6 +6487,13 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) * Convert topological properties into behaviour. 
*/ + if (sd->flags & SD_ASYM_CPUCAPACITY) { + struct sched_domain *t = sd; + + for_each_lower_domain(t) + t->flags |= SD_BALANCE_WAKE; + } + if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; @@ -6866,16 +6909,13 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { - struct sched_domain *sd = sd_init(tl, cpu); - if (!sd) - return child; + struct sched_domain *sd = sd_init(tl, child, cpu); cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { sd->level = child->level + 1; sched_domain_level_max = max(sched_domain_level_max, sd->level); child->parent = sd; - sd->child = child; if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) { @@ -6906,6 +6946,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; + struct rq *rq = NULL; int i, ret = -ENOMEM; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); @@ -6956,11 +6997,22 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { + rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); + + /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ + if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) + WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); + cpu_attach_domain(sd, d.rd, i); } rcu_read_unlock(); + if (rq && sched_debug_enabled) { + pr_info("span: %*pbl (max cpu_capacity = %lu)\n", + cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); + } + ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); @@ -7523,10 +7575,6 @@ void __init sched_init(void) set_load_weight(&init_task); -#ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&init_task.preempt_notifiers); -#endif - /* * The boot idle thread does lazy MMU switching as 
well: */ @@ -7534,11 +7582,6 @@ void __init sched_init(void) enter_lazy_tlb(&init_mm, current); /* - * During early bootup we pretend to be a normal task: - */ - current->sched_class = &fair_sched_class; - - /* * Make us the idle thread. Technically, schedule() should not be * called from this thread, however somewhere below it might be, * but because we are the idle thread, we just pick up running again @@ -7592,6 +7635,7 @@ EXPORT_SYMBOL(__might_sleep); void ___might_sleep(const char *file, int line, int preempt_offset) { static unsigned long prev_jiffy; /* ratelimiting */ + unsigned long preempt_disable_ip; rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && @@ -7602,6 +7646,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset) return; prev_jiffy = jiffies; + /* Save this before calling printk(), since that will clobber it */ + preempt_disable_ip = get_preempt_disable_ip(current); + printk(KERN_ERR "BUG: sleeping function called from invalid context at %s:%d\n", file, line); @@ -7616,14 +7663,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset) debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); -#ifdef CONFIG_DEBUG_PREEMPT - if (!preempt_count_equals(preempt_offset)) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) + && !preempt_count_equals(preempt_offset)) { pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); + print_ip_sym(preempt_disable_ip); pr_cont("\n"); } -#endif dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } EXPORT_SYMBOL(___might_sleep); #endif @@ -7644,12 +7691,10 @@ void normalize_rt_tasks(void) if (p->flags & PF_KTHREAD) continue; - p->se.exec_start = 0; -#ifdef CONFIG_SCHEDSTATS - p->se.statistics.wait_start = 0; - p->se.statistics.sleep_start = 0; - p->se.statistics.block_start = 0; -#endif + p->se.exec_start = 0; + schedstat_set(p->se.statistics.wait_start, 
0); + schedstat_set(p->se.statistics.sleep_start, 0); + schedstat_set(p->se.statistics.block_start, 0); if (!dl_task(p) && !rt_task(p)) { /* diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index d4184498c9f5..e73119013c53 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -31,56 +31,81 @@ static inline int right_child(int i) return (i << 1) + 2; } -static void cpudl_exchange(struct cpudl *cp, int a, int b) +static void cpudl_heapify_down(struct cpudl *cp, int idx) { - int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; + int l, r, largest; - swap(cp->elements[a].cpu, cp->elements[b].cpu); - swap(cp->elements[a].dl , cp->elements[b].dl ); + int orig_cpu = cp->elements[idx].cpu; + u64 orig_dl = cp->elements[idx].dl; - swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); -} - -static void cpudl_heapify(struct cpudl *cp, int idx) -{ - int l, r, largest; + if (left_child(idx) >= cp->size) + return; /* adapted from lib/prio_heap.c */ while(1) { + u64 largest_dl; l = left_child(idx); r = right_child(idx); largest = idx; + largest_dl = orig_dl; - if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, - cp->elements[l].dl)) + if ((l < cp->size) && dl_time_before(orig_dl, + cp->elements[l].dl)) { largest = l; - if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, - cp->elements[r].dl)) + largest_dl = cp->elements[l].dl; + } + if ((r < cp->size) && dl_time_before(largest_dl, + cp->elements[r].dl)) largest = r; + if (largest == idx) break; - /* Push idx down the heap one level and bump one up */ - cpudl_exchange(cp, largest, idx); + /* pull largest child onto idx */ + cp->elements[idx].cpu = cp->elements[largest].cpu; + cp->elements[idx].dl = cp->elements[largest].dl; + cp->elements[cp->elements[idx].cpu].idx = idx; idx = largest; } + /* actual push down of saved original values orig_* */ + cp->elements[idx].cpu = orig_cpu; + cp->elements[idx].dl = orig_dl; + cp->elements[cp->elements[idx].cpu].idx = idx; 
} -static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) +static void cpudl_heapify_up(struct cpudl *cp, int idx) { - WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); + int p; - if (dl_time_before(new_dl, cp->elements[idx].dl)) { - cp->elements[idx].dl = new_dl; - cpudl_heapify(cp, idx); - } else { - cp->elements[idx].dl = new_dl; - while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, - cp->elements[idx].dl)) { - cpudl_exchange(cp, idx, parent(idx)); - idx = parent(idx); - } - } + int orig_cpu = cp->elements[idx].cpu; + u64 orig_dl = cp->elements[idx].dl; + + if (idx == 0) + return; + + do { + p = parent(idx); + if (dl_time_before(orig_dl, cp->elements[p].dl)) + break; + /* pull parent onto idx */ + cp->elements[idx].cpu = cp->elements[p].cpu; + cp->elements[idx].dl = cp->elements[p].dl; + cp->elements[cp->elements[idx].cpu].idx = idx; + idx = p; + } while (idx != 0); + /* actual push up of saved original values orig_* */ + cp->elements[idx].cpu = orig_cpu; + cp->elements[idx].dl = orig_dl; + cp->elements[cp->elements[idx].cpu].idx = idx; +} + +static void cpudl_heapify(struct cpudl *cp, int idx) +{ + if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, + cp->elements[idx].dl)) + cpudl_heapify_up(cp, idx); + else + cpudl_heapify_down(cp, idx); } static inline int cpudl_maximum(struct cpudl *cp) @@ -120,16 +145,15 @@ out: } /* - * cpudl_set - update the cpudl max-heap + * cpudl_clear - remove a cpu from the cpudl max-heap * @cp: the cpudl max-heap context * @cpu: the target cpu - * @dl: the new earliest deadline for this cpu * * Notes: assumes cpu_rq(cpu)->lock is locked * * Returns: (void) */ -void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) +void cpudl_clear(struct cpudl *cp, int cpu) { int old_idx, new_cpu; unsigned long flags; @@ -137,47 +161,60 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) WARN_ON(!cpu_present(cpu)); raw_spin_lock_irqsave(&cp->lock, flags); + old_idx = cp->elements[cpu].idx; - 
if (!is_valid) { - /* remove item */ - if (old_idx == IDX_INVALID) { - /* - * Nothing to remove if old_idx was invalid. - * This could happen if a rq_offline_dl is - * called for a CPU without -dl tasks running. - */ - goto out; - } + if (old_idx == IDX_INVALID) { + /* + * Nothing to remove if old_idx was invalid. + * This could happen if a rq_offline_dl is + * called for a CPU without -dl tasks running. + */ + } else { new_cpu = cp->elements[cp->size - 1].cpu; cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; cp->elements[old_idx].cpu = new_cpu; cp->size--; cp->elements[new_cpu].idx = old_idx; cp->elements[cpu].idx = IDX_INVALID; - while (old_idx > 0 && dl_time_before( - cp->elements[parent(old_idx)].dl, - cp->elements[old_idx].dl)) { - cpudl_exchange(cp, old_idx, parent(old_idx)); - old_idx = parent(old_idx); - } - cpumask_set_cpu(cpu, cp->free_cpus); - cpudl_heapify(cp, old_idx); + cpudl_heapify(cp, old_idx); - goto out; + cpumask_set_cpu(cpu, cp->free_cpus); } + raw_spin_unlock_irqrestore(&cp->lock, flags); +} + +/* + * cpudl_set - update the cpudl max-heap + * @cp: the cpudl max-heap context + * @cpu: the target cpu + * @dl: the new earliest deadline for this cpu + * + * Notes: assumes cpu_rq(cpu)->lock is locked + * + * Returns: (void) + */ +void cpudl_set(struct cpudl *cp, int cpu, u64 dl) +{ + int old_idx; + unsigned long flags; + WARN_ON(!cpu_present(cpu)); + + raw_spin_lock_irqsave(&cp->lock, flags); + + old_idx = cp->elements[cpu].idx; if (old_idx == IDX_INVALID) { - cp->size++; - cp->elements[cp->size - 1].dl = dl; - cp->elements[cp->size - 1].cpu = cpu; - cp->elements[cpu].idx = cp->size - 1; - cpudl_change_key(cp, cp->size - 1, dl); + int new_idx = cp->size++; + cp->elements[new_idx].dl = dl; + cp->elements[new_idx].cpu = cpu; + cp->elements[cpu].idx = new_idx; + cpudl_heapify_up(cp, new_idx); cpumask_clear_cpu(cpu, cp->free_cpus); } else { - cpudl_change_key(cp, old_idx, dl); + cp->elements[old_idx].dl = dl; + cpudl_heapify(cp, old_idx); } 
-out: raw_spin_unlock_irqrestore(&cp->lock, flags); } diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index fcbdf83fed7e..f7da8c55bba0 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -23,7 +23,8 @@ struct cpudl { #ifdef CONFIG_SMP int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); -void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); +void cpudl_set(struct cpudl *cp, int cpu, u64 dl); +void cpudl_clear(struct cpudl *cp, int cpu); int cpudl_init(struct cpudl *cp); void cpudl_set_freecpu(struct cpudl *cp, int cpu); void cpudl_clear_freecpu(struct cpudl *cp, int cpu); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a846cf89eb96..b93c72d5f64f 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -306,6 +306,26 @@ static inline cputime_t account_other_time(cputime_t max) return accounted; } +#ifdef CONFIG_64BIT +static inline u64 read_sum_exec_runtime(struct task_struct *t) +{ + return t->se.sum_exec_runtime; +} +#else +static u64 read_sum_exec_runtime(struct task_struct *t) +{ + u64 ns; + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(t, &rf); + ns = t->se.sum_exec_runtime; + task_rq_unlock(rq, t, &rf); + + return ns; +} +#endif + /* * Accumulate raw cputime values of dead tasks (sig->[us]time) and live * tasks (sum on group iteration) belonging to @tsk's group. @@ -318,6 +338,17 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) unsigned int seq, nextseq; unsigned long flags; + /* + * Update current task runtime to account pending time since last + * scheduler action or thread_group_cputime() call. This thread group + * might have other running tasks on different CPUs, but updating + * their runtime can affect syscall performance, so we skip account + * those pending times and rely only on values updated on tick or + * other scheduler action. 
+ */ + if (same_thread_group(current, tsk)) + (void) task_sched_runtime(current); + rcu_read_lock(); /* Attempt a lockless read on the first round. */ nextseq = 0; @@ -332,7 +363,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) task_cputime(t, &utime, &stime); times->utime += utime; times->stime += stime; - times->sum_exec_runtime += task_sched_runtime(t); + times->sum_exec_runtime += read_sum_exec_runtime(t); } /* If lockless access failed, take the lock. */ nextseq = 1; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 1ce8867283dc..0c75bc656178 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -243,10 +243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) { struct rq *later_rq = NULL; - bool fallback = false; later_rq = find_lock_later_rq(p, rq); - if (!later_rq) { int cpu; @@ -254,7 +252,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p * If we cannot preempt any rq, fall back to pick any * online cpu. */ - fallback = true; cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); if (cpu >= nr_cpu_ids) { /* @@ -274,16 +271,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p double_lock_balance(rq, later_rq); } - /* - * By now the task is replenished and enqueued; migrate it. - */ - deactivate_task(rq, p, 0); set_task_cpu(p, later_rq->cpu); - activate_task(later_rq, p, 0); - - if (!fallback) - resched_curr(later_rq); - double_unlock_balance(later_rq, rq); return later_rq; @@ -346,12 +334,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, * one, and to (try to!) reconcile itself with its own scheduling * parameters. 
*/ -static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); + WARN_ON(dl_se->dl_boosted); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); /* @@ -367,8 +355,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, * future; in fact, we must consider execution overheads (time * spent on hardirq context, etc.). */ - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; + dl_se->runtime = dl_se->dl_runtime; } /* @@ -641,29 +629,31 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) goto unlock; } - enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (dl_task(rq->curr)) - check_preempt_curr_dl(rq, p, 0); - else - resched_curr(rq); - #ifdef CONFIG_SMP - /* - * Perform balancing operations here; after the replenishments. We - * cannot drop rq->lock before this, otherwise the assertion in - * start_dl_timer() about not missing updates is not true. - * - * If we find that the rq the task was on is no longer available, we - * need to select a new rq. - * - * XXX figure out if select_task_rq_dl() deals with offline cpus. - */ if (unlikely(!rq->online)) { + /* + * If the runqueue is no longer available, migrate the + * task elsewhere. This necessarily changes rq. + */ lockdep_unpin_lock(&rq->lock, rf.cookie); rq = dl_task_offline_migration(rq, p); rf.cookie = lockdep_pin_lock(&rq->lock); + + /* + * Now that the task has been migrated to the new RQ and we + * have that locked, proceed as normal and enqueue the task + * there. 
+ */ } +#endif + + enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); + if (dl_task(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_curr(rq); +#ifdef CONFIG_SMP /* * Queueing this task back might have overloaded rq, check if we need * to kick someone away. @@ -798,7 +788,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (dl_rq->earliest_dl.curr == 0 || dl_time_before(deadline, dl_rq->earliest_dl.curr)) { dl_rq->earliest_dl.curr = deadline; - cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); + cpudl_set(&rq->rd->cpudl, rq->cpu, deadline); } } @@ -813,14 +803,14 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (!dl_rq->dl_nr_running) { dl_rq->earliest_dl.curr = 0; dl_rq->earliest_dl.next = 0; - cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); + cpudl_clear(&rq->rd->cpudl, rq->cpu); } else { struct rb_node *leftmost = dl_rq->rb_leftmost; struct sched_dl_entity *entry; entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); dl_rq->earliest_dl.curr = entry->deadline; - cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); + cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline); } } @@ -1671,7 +1661,7 @@ static void rq_online_dl(struct rq *rq) cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); if (rq->dl.dl_nr_running > 0) - cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); + cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr); } /* Assumes rq->lock is held */ @@ -1680,7 +1670,7 @@ static void rq_offline_dl(struct rq *rq) if (rq->dl.overloaded) dl_clear_overload(rq); - cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); + cpudl_clear(&rq->rd->cpudl, rq->cpu); cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); } @@ -1723,10 +1713,20 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { + + /* If p is not queued we will update its parameters at next wakeup. 
*/ + if (!task_on_rq_queued(p)) + return; + + /* + * If p is boosted we already updated its params in + * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), + * p's deadline being now already after rq_clock(rq). + */ if (dl_time_before(p->dl.deadline, rq_clock(rq))) - setup_new_dl_entity(&p->dl, &p->dl); + setup_new_dl_entity(&p->dl); - if (task_on_rq_queued(p) && rq->curr != p) { + if (rq->curr != p) { #ifdef CONFIG_SMP if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) queue_push_tasks(rq); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2a0a9995256d..13935886a471 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -369,8 +369,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #define P(F) \ SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) +#define P_SCHEDSTAT(F) \ + SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) #define PN(F) \ SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) +#define PN_SCHEDSTAT(F) \ + SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) if (!se) return; @@ -378,26 +382,27 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN(se->exec_start); PN(se->vruntime); PN(se->sum_exec_runtime); -#ifdef CONFIG_SCHEDSTATS if (schedstat_enabled()) { - PN(se->statistics.wait_start); - PN(se->statistics.sleep_start); - PN(se->statistics.block_start); - PN(se->statistics.sleep_max); - PN(se->statistics.block_max); - PN(se->statistics.exec_max); - PN(se->statistics.slice_max); - PN(se->statistics.wait_max); - PN(se->statistics.wait_sum); - P(se->statistics.wait_count); + PN_SCHEDSTAT(se->statistics.wait_start); + PN_SCHEDSTAT(se->statistics.sleep_start); + PN_SCHEDSTAT(se->statistics.block_start); + PN_SCHEDSTAT(se->statistics.sleep_max); + PN_SCHEDSTAT(se->statistics.block_max); + PN_SCHEDSTAT(se->statistics.exec_max); + PN_SCHEDSTAT(se->statistics.slice_max); + 
PN_SCHEDSTAT(se->statistics.wait_max); + PN_SCHEDSTAT(se->statistics.wait_sum); + P_SCHEDSTAT(se->statistics.wait_count); } -#endif P(se->load.weight); #ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); #endif + +#undef PN_SCHEDSTAT #undef PN +#undef P_SCHEDSTAT #undef P } #endif @@ -429,9 +434,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) p->prio); SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)), + SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)), SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime))); + SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime))); #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); @@ -626,9 +631,7 @@ do { \ #undef P64 #endif -#ifdef CONFIG_SCHEDSTATS -#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); - +#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n)); if (schedstat_enabled()) { P(yld_count); P(sched_count); @@ -636,9 +639,8 @@ do { \ P(ttwu_count); P(ttwu_local); } - #undef P -#endif + spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); print_rt_stats(m, cpu); @@ -868,10 +870,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) #define P(F) \ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) +#define P_SCHEDSTAT(F) \ + SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F)) #define __PN(F) \ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) #define PN(F) \ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) +#define PN_SCHEDSTAT(F) \ + SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F))) PN(se.exec_start); PN(se.vruntime); @@ -881,37 +887,36 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.nr_migrations); -#ifdef 
CONFIG_SCHEDSTATS if (schedstat_enabled()) { u64 avg_atom, avg_per_cpu; - PN(se.statistics.sum_sleep_runtime); - PN(se.statistics.wait_start); - PN(se.statistics.sleep_start); - PN(se.statistics.block_start); - PN(se.statistics.sleep_max); - PN(se.statistics.block_max); - PN(se.statistics.exec_max); - PN(se.statistics.slice_max); - PN(se.statistics.wait_max); - PN(se.statistics.wait_sum); - P(se.statistics.wait_count); - PN(se.statistics.iowait_sum); - P(se.statistics.iowait_count); - P(se.statistics.nr_migrations_cold); - P(se.statistics.nr_failed_migrations_affine); - P(se.statistics.nr_failed_migrations_running); - P(se.statistics.nr_failed_migrations_hot); - P(se.statistics.nr_forced_migrations); - P(se.statistics.nr_wakeups); - P(se.statistics.nr_wakeups_sync); - P(se.statistics.nr_wakeups_migrate); - P(se.statistics.nr_wakeups_local); - P(se.statistics.nr_wakeups_remote); - P(se.statistics.nr_wakeups_affine); - P(se.statistics.nr_wakeups_affine_attempts); - P(se.statistics.nr_wakeups_passive); - P(se.statistics.nr_wakeups_idle); + PN_SCHEDSTAT(se.statistics.sum_sleep_runtime); + PN_SCHEDSTAT(se.statistics.wait_start); + PN_SCHEDSTAT(se.statistics.sleep_start); + PN_SCHEDSTAT(se.statistics.block_start); + PN_SCHEDSTAT(se.statistics.sleep_max); + PN_SCHEDSTAT(se.statistics.block_max); + PN_SCHEDSTAT(se.statistics.exec_max); + PN_SCHEDSTAT(se.statistics.slice_max); + PN_SCHEDSTAT(se.statistics.wait_max); + PN_SCHEDSTAT(se.statistics.wait_sum); + P_SCHEDSTAT(se.statistics.wait_count); + PN_SCHEDSTAT(se.statistics.iowait_sum); + P_SCHEDSTAT(se.statistics.iowait_count); + P_SCHEDSTAT(se.statistics.nr_migrations_cold); + P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine); + P_SCHEDSTAT(se.statistics.nr_failed_migrations_running); + P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot); + P_SCHEDSTAT(se.statistics.nr_forced_migrations); + P_SCHEDSTAT(se.statistics.nr_wakeups); + P_SCHEDSTAT(se.statistics.nr_wakeups_sync); + 
P_SCHEDSTAT(se.statistics.nr_wakeups_migrate); + P_SCHEDSTAT(se.statistics.nr_wakeups_local); + P_SCHEDSTAT(se.statistics.nr_wakeups_remote); + P_SCHEDSTAT(se.statistics.nr_wakeups_affine); + P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts); + P_SCHEDSTAT(se.statistics.nr_wakeups_passive); + P_SCHEDSTAT(se.statistics.nr_wakeups_idle); avg_atom = p->se.sum_exec_runtime; if (nr_switches) @@ -930,7 +935,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) __PN(avg_atom); __PN(avg_per_cpu); } -#endif + __P(nr_switches); SEQ_printf(m, "%-45s:%21Ld\n", "nr_voluntary_switches", (long long)p->nvcsw); @@ -947,8 +952,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) #endif P(policy); P(prio); +#undef PN_SCHEDSTAT #undef PN #undef __PN +#undef P_SCHEDSTAT #undef P #undef __P diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 039de34f1521..8fb4d1942c14 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -114,6 +114,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +/* + * The margin used when comparing utilization with CPU capacity: + * util * 1024 < capacity * margin + */ +unsigned int capacity_margin = 1280; /* ~20% */ + static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; @@ -656,7 +662,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP -static int select_idle_sibling(struct task_struct *p, int cpu); +static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); /* @@ -726,7 +732,6 @@ void post_init_entity_util_avg(struct sched_entity *se) struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; u64 now = cfs_rq_clock_task(cfs_rq); - int tg_update; if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -759,10 +764,9 
@@ void post_init_entity_util_avg(struct sched_entity *se) } } - tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); - if (tg_update) - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq, false); } #else /* !CONFIG_SMP */ @@ -799,7 +803,7 @@ static void update_curr(struct cfs_rq *cfs_rq) max(delta_exec, curr->statistics.exec_max)); curr->sum_exec_runtime += delta_exec; - schedstat_add(cfs_rq, exec_clock, delta_exec); + schedstat_add(cfs_rq->exec_clock, delta_exec); curr->vruntime += calc_delta_fair(delta_exec, curr); update_min_vruntime(cfs_rq); @@ -820,26 +824,34 @@ static void update_curr_fair(struct rq *rq) update_curr(cfs_rq_of(&rq->curr->se)); } -#ifdef CONFIG_SCHEDSTATS static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 wait_start = rq_clock(rq_of(cfs_rq)); + u64 wait_start, prev_wait_start; + + if (!schedstat_enabled()) + return; + + wait_start = rq_clock(rq_of(cfs_rq)); + prev_wait_start = schedstat_val(se->statistics.wait_start); if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && - likely(wait_start > se->statistics.wait_start)) - wait_start -= se->statistics.wait_start; + likely(wait_start > prev_wait_start)) + wait_start -= prev_wait_start; - se->statistics.wait_start = wait_start; + schedstat_set(se->statistics.wait_start, wait_start); } -static void +static inline void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct task_struct *p; u64 delta; - delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; + if (!schedstat_enabled()) + return; + + delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); if (entity_is_task(se)) { p = task_of(se); @@ -849,35 +861,114 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) * time stamp can be adjusted to accumulate wait time * prior to migration. 
*/ - se->statistics.wait_start = delta; + schedstat_set(se->statistics.wait_start, delta); return; } trace_sched_stat_wait(p, delta); } - se->statistics.wait_max = max(se->statistics.wait_max, delta); - se->statistics.wait_count++; - se->statistics.wait_sum += delta; - se->statistics.wait_start = 0; + schedstat_set(se->statistics.wait_max, + max(schedstat_val(se->statistics.wait_max), delta)); + schedstat_inc(se->statistics.wait_count); + schedstat_add(se->statistics.wait_sum, delta); + schedstat_set(se->statistics.wait_start, 0); +} + +static inline void +update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct task_struct *tsk = NULL; + u64 sleep_start, block_start; + + if (!schedstat_enabled()) + return; + + sleep_start = schedstat_val(se->statistics.sleep_start); + block_start = schedstat_val(se->statistics.block_start); + + if (entity_is_task(se)) + tsk = task_of(se); + + if (sleep_start) { + u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) + schedstat_set(se->statistics.sleep_max, delta); + + schedstat_set(se->statistics.sleep_start, 0); + schedstat_add(se->statistics.sum_sleep_runtime, delta); + + if (tsk) { + account_scheduler_latency(tsk, delta >> 10, 1); + trace_sched_stat_sleep(tsk, delta); + } + } + if (block_start) { + u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > schedstat_val(se->statistics.block_max))) + schedstat_set(se->statistics.block_max, delta); + + schedstat_set(se->statistics.block_start, 0); + schedstat_add(se->statistics.sum_sleep_runtime, delta); + + if (tsk) { + if (tsk->in_iowait) { + schedstat_add(se->statistics.iowait_sum, delta); + schedstat_inc(se->statistics.iowait_count); + trace_sched_stat_iowait(tsk, delta); + } + + trace_sched_stat_blocked(tsk, delta); + + /* + * Blocking time is in units of nanosecs, so shift by + * 20 to get 
a milliseconds-range estimation of the + * amount of time that the task spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + profile_hits(SLEEP_PROFILING, + (void *)get_wchan(tsk), + delta >> 20); + } + account_scheduler_latency(tsk, delta >> 10, 0); + } + } } /* * Task is being enqueued - update stats: */ static inline void -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + if (!schedstat_enabled()) + return; + /* * Are we enqueueing a waiting task? (for current tasks * a dequeue/enqueue event is a NOP) */ if (se != cfs_rq->curr) update_stats_wait_start(cfs_rq, se); + + if (flags & ENQUEUE_WAKEUP) + update_stats_enqueue_sleeper(cfs_rq, se); } static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + + if (!schedstat_enabled()) + return; + /* * Mark the end of the wait period if dequeueing a * waiting task: @@ -885,40 +976,18 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) update_stats_wait_end(cfs_rq, se); - if (flags & DEQUEUE_SLEEP) { - if (entity_is_task(se)) { - struct task_struct *tsk = task_of(se); + if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { + struct task_struct *tsk = task_of(se); - if (tsk->state & TASK_INTERRUPTIBLE) - se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); - if (tsk->state & TASK_UNINTERRUPTIBLE) - se->statistics.block_start = rq_clock(rq_of(cfs_rq)); - } + if (tsk->state & TASK_INTERRUPTIBLE) + schedstat_set(se->statistics.sleep_start, + rq_clock(rq_of(cfs_rq))); + if (tsk->state & TASK_UNINTERRUPTIBLE) + schedstat_set(se->statistics.block_start, + rq_clock(rq_of(cfs_rq))); } - -} -#else -static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ } -static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} - -static inline void 
-update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} - -static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) -{ -} -#endif - /* * We are picking a new current task - update its stats: */ @@ -1514,7 +1583,8 @@ balance: * Call select_idle_sibling to maybe find a better one. */ if (!cur) - env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); + env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, + env->dst_cpu); assign: task_numa_assign(env, cur, imp); @@ -2803,9 +2873,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, } #ifdef CONFIG_FAIR_GROUP_SCHED -/* - * Updating tg's load_avg is necessary before update_cfs_share (which is done) - * and effective_load (which is not done because it is too costly). +/** + * update_tg_load_avg - update the tg's load avg + * @cfs_rq: the cfs_rq whose avg changed + * @force: update regardless of how small the difference + * + * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. + * However, because tg->load_avg is a global value there are performance + * considerations. + * + * In order to avoid having to look at the other cfs_rq's, we use a + * differential update where we store the last value we propagated. This in + * turn allows skipping updates if the differential is 'small'. + * + * Updating tg's load_avg is necessary before update_cfs_share() (which is + * done) and effective_load() (which is not done because it is too costly). */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { @@ -2931,10 +3013,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) * * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * - * Returns true if the load decayed or we removed utilization. It is expected - * that one calls update_tg_load_avg() on this condition, but after you've - * modified the cfs_rq avg (attach/detach), such that we propagate the new - * avg up. 
+ * Returns true if the load decayed or we removed load. + * + * Since both these conditions indicate a changed cfs_rq->avg.load we should + * call update_tg_load_avg() when this function returns true. */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) @@ -3183,68 +3265,6 @@ static inline int idle_balance(struct rq *rq) #endif /* CONFIG_SMP */ -static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -#ifdef CONFIG_SCHEDSTATS - struct task_struct *tsk = NULL; - - if (entity_is_task(se)) - tsk = task_of(se); - - if (se->statistics.sleep_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > se->statistics.sleep_max)) - se->statistics.sleep_max = delta; - - se->statistics.sleep_start = 0; - se->statistics.sum_sleep_runtime += delta; - - if (tsk) { - account_scheduler_latency(tsk, delta >> 10, 1); - trace_sched_stat_sleep(tsk, delta); - } - } - if (se->statistics.block_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > se->statistics.block_max)) - se->statistics.block_max = delta; - - se->statistics.block_start = 0; - se->statistics.sum_sleep_runtime += delta; - - if (tsk) { - if (tsk->in_iowait) { - se->statistics.iowait_sum += delta; - se->statistics.iowait_count++; - trace_sched_stat_iowait(tsk, delta); - } - - trace_sched_stat_blocked(tsk, delta); - - /* - * Blocking time is in units of nanosecs, so shift by - * 20 to get a milliseconds-range estimation of the - * amount of time that the task spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - profile_hits(SLEEP_PROFILING, - (void *)get_wchan(tsk), - delta >> 20); - } - account_scheduler_latency(tsk, delta >> 10, 0); - } - } -#endif -} - static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHED_DEBUG @@ -3254,7 +3274,7 @@ static 
void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) d = -d; if (d > 3*sysctl_sched_latency) - schedstat_inc(cfs_rq, nr_spread_over); + schedstat_inc(cfs_rq->nr_spread_over); #endif } @@ -3371,17 +3391,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); - if (flags & ENQUEUE_WAKEUP) { + if (flags & ENQUEUE_WAKEUP) place_entity(cfs_rq, se, 0); - if (schedstat_enabled()) - enqueue_sleeper(cfs_rq, se); - } check_schedstat_required(); - if (schedstat_enabled()) { - update_stats_enqueue(cfs_rq, se); - check_spread(cfs_rq, se); - } + update_stats_enqueue(cfs_rq, se, flags); + check_spread(cfs_rq, se); if (!curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; @@ -3448,8 +3463,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_curr(cfs_rq); dequeue_entity_load_avg(cfs_rq, se); - if (schedstat_enabled()) - update_stats_dequeue(cfs_rq, se, flags); + update_stats_dequeue(cfs_rq, se, flags); clear_buddies(cfs_rq, se); @@ -3523,25 +3537,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * a CPU. So account for the time it spent waiting on the * runqueue. */ - if (schedstat_enabled()) - update_stats_wait_end(cfs_rq, se); + update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(se, 1); } update_stats_curr_start(cfs_rq, se); cfs_rq->curr = se; -#ifdef CONFIG_SCHEDSTATS + /* * Track our maximum slice length, if the CPU's load is at * least twice that of our own weight (i.e. 
dont track it * when there are only lesser-weight tasks around): */ if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { - se->statistics.slice_max = max(se->statistics.slice_max, - se->sum_exec_runtime - se->prev_sum_exec_runtime); + schedstat_set(se->statistics.slice_max, + max((u64)schedstat_val(se->statistics.slice_max), + se->sum_exec_runtime - se->prev_sum_exec_runtime)); } -#endif + se->prev_sum_exec_runtime = se->sum_exec_runtime; } @@ -3620,13 +3634,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* throttle cfs_rqs exceeding runtime */ check_cfs_rq_runtime(cfs_rq); - if (schedstat_enabled()) { - check_spread(cfs_rq, prev); - if (prev->on_rq) - update_stats_wait_start(cfs_rq, prev); - } + check_spread(cfs_rq, prev); if (prev->on_rq) { + update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ @@ -4458,7 +4469,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) WARN_ON(task_rq(p) != rq); - if (cfs_rq->nr_running > 1) { + if (rq->cfs.h_nr_running > 1) { u64 slice = sched_slice(cfs_rq, se); u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; s64 delta = slice - ran; @@ -5091,18 +5102,18 @@ static int wake_wide(struct task_struct *p) return 1; } -static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) +static int wake_affine(struct sched_domain *sd, struct task_struct *p, + int prev_cpu, int sync) { s64 this_load, load; s64 this_eff_load, prev_eff_load; - int idx, this_cpu, prev_cpu; + int idx, this_cpu; struct task_group *tg; unsigned long weight; int balanced; idx = sd->wake_idx; this_cpu = smp_processor_id(); - prev_cpu = task_cpu(p); load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); @@ -5146,13 +5157,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) balanced = this_eff_load <= 
prev_eff_load; - schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); + schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); if (!balanced) return 0; - schedstat_inc(sd, ttwu_move_affine); - schedstat_inc(p, se.statistics.nr_wakeups_affine); + schedstat_inc(sd->ttwu_move_affine); + schedstat_inc(p->se.statistics.nr_wakeups_affine); return 1; } @@ -5228,6 +5239,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) int shallowest_idle_cpu = -1; int i; + /* Check if we have any choice: */ + if (group->group_weight == 1) + return cpumask_first(sched_group_cpus(group)); + /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { if (idle_cpu(i)) { @@ -5267,11 +5282,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* * Try and locate an idle CPU in the sched_domain. */ -static int select_idle_sibling(struct task_struct *p, int target) +static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; struct sched_group *sg; - int i = task_cpu(p); if (idle_cpu(target)) return target; @@ -5279,8 +5293,8 @@ static int select_idle_sibling(struct task_struct *p, int target) /* * If the prevous cpu is cache affine and idle, don't be stupid. */ - if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) - return i; + if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) + return prev; /* * Otherwise, iterate the domains and find an eligible idle cpu. @@ -5301,6 +5315,8 @@ static int select_idle_sibling(struct task_struct *p, int target) for_each_lower_domain(sd) { sg = sd->groups; do { + int i; + if (!cpumask_intersects(sched_group_cpus(sg), tsk_cpus_allowed(p))) goto next; @@ -5360,6 +5376,32 @@ static int cpu_util(int cpu) return (util >= capacity) ? 
capacity : util; } +static inline int task_util(struct task_struct *p) +{ + return p->se.avg.util_avg; +} + +/* + * Disable WAKE_AFFINE in the case where task @p doesn't fit in the + * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. + * + * In that case WAKE_AFFINE doesn't make sense and we'll let + * BALANCE_WAKE sort things out. + */ +static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) +{ + long min_cap, max_cap; + + min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); + max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; + + /* Minimum capacity is close to max, no need to abort wake_affine */ + if (max_cap - min_cap < max_cap >> 3) + return 0; + + return min_cap * 1024 < task_util(p) * capacity_margin; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -5383,7 +5425,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); - want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) + && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); } rcu_read_lock(); @@ -5409,13 +5452,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (affine_sd) { sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) new_cpu = cpu; } if (!sd) { if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? 
*/ - new_cpu = select_idle_sibling(p, new_cpu); + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); } else while (sd) { struct sched_group *group; @@ -5939,7 +5982,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * The adjacency matrix of the resulting graph is given by: * - * log_2 n + * log_2 n * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) * k = 0 * @@ -5985,7 +6028,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * [XXX write more on how we solve this.. _after_ merging pjt's patches that * rewrite all of this once again.] - */ + */ static unsigned long __read_mostly max_load_balance_interval = HZ/10; @@ -6133,7 +6176,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { int cpu; - schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + schedstat_inc(p->se.statistics.nr_failed_migrations_affine); env->flags |= LBF_SOME_PINNED; @@ -6164,7 +6207,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) env->flags &= ~LBF_ALL_PINNED; if (task_running(env->src_rq, p)) { - schedstat_inc(p, se.statistics.nr_failed_migrations_running); + schedstat_inc(p->se.statistics.nr_failed_migrations_running); return 0; } @@ -6181,13 +6224,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (tsk_cache_hot <= 0 || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (tsk_cache_hot == 1) { - schedstat_inc(env->sd, lb_hot_gained[env->idle]); - schedstat_inc(p, se.statistics.nr_forced_migrations); + schedstat_inc(env->sd->lb_hot_gained[env->idle]); + schedstat_inc(p->se.statistics.nr_forced_migrations); } return 1; } - schedstat_inc(p, se.statistics.nr_failed_migrations_hot); + schedstat_inc(p->se.statistics.nr_failed_migrations_hot); return 0; } @@ -6227,7 +6270,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) * so we can safely collect stats here 
rather than * inside detach_tasks(). */ - schedstat_inc(env->sd, lb_gained[env->idle]); + schedstat_inc(env->sd->lb_gained[env->idle]); return p; } return NULL; @@ -6319,7 +6362,7 @@ next: * so we can safely collect detach_one_task() stats here rather * than inside detach_one_task(). */ - schedstat_add(env->sd, lb_gained[env->idle], detached); + schedstat_add(env->sd->lb_gained[env->idle], detached); return detached; } @@ -6647,7 +6690,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) /* * !SD_OVERLAP domains can assume that child groups * span the current group. - */ + */ group = child->groups; do { @@ -7147,7 +7190,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; if (load_above_capacity > busiest->group_capacity) { load_above_capacity -= busiest->group_capacity; - load_above_capacity *= NICE_0_LOAD; + load_above_capacity *= scale_load_down(NICE_0_LOAD); load_above_capacity /= busiest->group_capacity; } else load_above_capacity = ~0UL; @@ -7460,7 +7503,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, cpumask_copy(cpus, cpu_active_mask); - schedstat_inc(sd, lb_count[idle]); + schedstat_inc(sd->lb_count[idle]); redo: if (!should_we_balance(&env)) { @@ -7470,19 +7513,19 @@ redo: group = find_busiest_group(&env); if (!group) { - schedstat_inc(sd, lb_nobusyg[idle]); + schedstat_inc(sd->lb_nobusyg[idle]); goto out_balanced; } busiest = find_busiest_queue(&env, group); if (!busiest) { - schedstat_inc(sd, lb_nobusyq[idle]); + schedstat_inc(sd->lb_nobusyq[idle]); goto out_balanced; } BUG_ON(busiest == env.dst_rq); - schedstat_add(sd, lb_imbalance[idle], env.imbalance); + schedstat_add(sd->lb_imbalance[idle], env.imbalance); env.src_cpu = busiest->cpu; env.src_rq = busiest; @@ -7589,7 +7632,7 @@ more_balance: } if (!ld_moved) { - schedstat_inc(sd, lb_failed[idle]); + schedstat_inc(sd->lb_failed[idle]); /* * Increment the failure counter 
only on periodic balance. * We do not want newidle balance, which can be very @@ -7672,7 +7715,7 @@ out_all_pinned: * we can't migrate them. Let the imbalance flag set so parent level * can try to migrate them. */ - schedstat_inc(sd, lb_balanced[idle]); + schedstat_inc(sd->lb_balanced[idle]); sd->nr_balance_failed = 0; @@ -7704,11 +7747,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) } static inline void -update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) +update_next_balance(struct sched_domain *sd, unsigned long *next_balance) { unsigned long interval, next; - interval = get_sd_balance_interval(sd, cpu_busy); + /* used by idle balance, so cpu_busy = 0 */ + interval = get_sd_balance_interval(sd, 0); next = sd->last_balance + interval; if (time_after(*next_balance, next)) @@ -7738,7 +7782,7 @@ static int idle_balance(struct rq *this_rq) rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) - update_next_balance(sd, 0, &next_balance); + update_next_balance(sd, &next_balance); rcu_read_unlock(); goto out; @@ -7756,7 +7800,7 @@ static int idle_balance(struct rq *this_rq) continue; if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { - update_next_balance(sd, 0, &next_balance); + update_next_balance(sd, &next_balance); break; } @@ -7774,7 +7818,7 @@ static int idle_balance(struct rq *this_rq) curr_cost += domain_cost; } - update_next_balance(sd, 0, &next_balance); + update_next_balance(sd, &next_balance); /* * Stop searching for tasks to pull if there are @@ -7864,15 +7908,15 @@ static int active_load_balance_cpu_stop(void *data) .idle = CPU_IDLE, }; - schedstat_inc(sd, alb_count); + schedstat_inc(sd->alb_count); p = detach_one_task(&env); if (p) { - schedstat_inc(sd, alb_pushed); + schedstat_inc(sd->alb_pushed); /* Active balancing done, reset the failure counter. 
*/ sd->nr_balance_failed = 0; } else { - schedstat_inc(sd, alb_failed); + schedstat_inc(sd->alb_failed); } } rcu_read_unlock(); @@ -8441,7 +8485,6 @@ static void detach_task_cfs_rq(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - int tg_update; if (!vruntime_normalized(p)) { /* @@ -8453,10 +8496,9 @@ static void detach_task_cfs_rq(struct task_struct *p) } /* Catch up with the cfs_rq and remove our load when we leave */ - tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + update_cfs_rq_load_avg(now, cfs_rq, false); detach_entity_load_avg(cfs_rq, se); - if (tg_update) - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq, false); } static void attach_task_cfs_rq(struct task_struct *p) @@ -8464,7 +8506,6 @@ static void attach_task_cfs_rq(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - int tg_update; #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -8475,10 +8516,9 @@ static void attach_task_cfs_rq(struct task_struct *p) #endif /* Synchronize task with its cfs_rq */ - tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); - if (tg_update) - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq, false); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 2ce5458bbe1d..dedc81ecbb2e 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -28,7 +28,7 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie c { put_prev_task(rq, prev); - schedstat_inc(rq, sched_goidle); + schedstat_inc(rq->sched_goidle); return rq->idle; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c64fc5114004..420c05d099c3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -565,6 
+565,8 @@ struct root_domain { */ cpumask_var_t rto_mask; struct cpupri cpupri; + + unsigned long max_cpu_capacity; }; extern struct root_domain def_root_domain; @@ -597,7 +599,6 @@ struct rq { #ifdef CONFIG_SMP unsigned long last_load_update_tick; #endif /* CONFIG_SMP */ - u64 nohz_stamp; unsigned long nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 78955cbea31c..34659a853505 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -29,11 +29,12 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) if (rq) rq->rq_sched_info.run_delay += delta; } -# define schedstat_enabled() static_branch_unlikely(&sched_schedstats) -# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) -# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) -# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) -# define schedstat_val(rq, field) ((schedstat_enabled()) ? (rq)->field : 0) +#define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) +#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) +#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) +#define schedstat_val(var) (var) +#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? 
(var) : 0) #else /* !CONFIG_SCHEDSTATS */ static inline void @@ -45,12 +46,13 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) {} -# define schedstat_enabled() 0 -# define schedstat_inc(rq, field) do { } while (0) -# define schedstat_add(rq, field, amt) do { } while (0) -# define schedstat_set(var, val) do { } while (0) -# define schedstat_val(rq, field) 0 -#endif +#define schedstat_enabled() 0 +#define schedstat_inc(var) do { } while (0) +#define schedstat_add(var, amt) do { } while (0) +#define schedstat_set(var, val) do { } while (0) +#define schedstat_val(var) 0 +#define schedstat_val_or_zero(var) 0 +#endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_SCHED_INFO static inline void sched_info_reset_dequeued(struct task_struct *t) diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 13bc43d1fb22..fc0d8270f69e 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -122,12 +122,12 @@ static int smpboot_thread_fn(void *data) if (kthread_should_park()) { __set_current_state(TASK_RUNNING); - preempt_enable(); if (ht->park && td->status == HP_THREAD_ACTIVE) { BUG_ON(td->cpu != smp_processor_id()); ht->park(td->cpu); td->status = HP_THREAD_PARKED; } + preempt_enable(); kthread_parkme(); /* We might have been woken for stop */ continue; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 4a1ca5f6da7e..082e71f17a58 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -126,6 +126,11 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) cpu_stop_init_done(&done, 1); if (!cpu_stop_queue_work(cpu, &work)) return -ENOENT; + /* + * In case @cpu == smp_proccessor_id() we can avoid a sleep+wakeup + * cycle by doing a preemption: + */ + cond_resched(); wait_for_completion(&done.completion); return done.ret; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 53ae6d00656a..283583fcb1e7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ 
-1165,7 +1165,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) } /* See similar comment in do_numa_page for explanation */ - if (!(vma->vm_flags & VM_WRITE)) + if (!pmd_write(pmd)) flags |= TNF_NO_GROUP; /* diff --git a/mm/memory.c b/mm/memory.c index 793fe0f9841c..f1a68049edff 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3395,7 +3395,7 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) * pte_dirty has unpredictable behaviour between PTE scan updates, * background writeback, dirty balancing and application behaviour. */ - if (!(vma->vm_flags & VM_WRITE)) + if (!pte_write(pte)) flags |= TNF_NO_GROUP; /* diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index bd09d0effef8..143b6cdd7f06 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -175,6 +175,7 @@ static int __dead_end_function(struct objtool_file *file, struct symbol *func, "__stack_chk_fail", "panic", "do_exit", + "do_task_dead", "__module_put_and_exit", "complete_and_exit", "kvm_spurious_fault", |