diff options
Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 1828 |
1 files changed, 55 insertions, 1773 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index c9e3ab6e299e..2ffcceed8862 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -56,7 +56,6 @@ #include <linux/percpu.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> -#include <linux/stop_machine.h> #include <linux/sysctl.h> #include <linux/syscalls.h> #include <linux/times.h> @@ -72,133 +71,20 @@ #include <linux/ftrace.h> #include <linux/slab.h> #include <linux/init_task.h> -#include <linux/jump_label.h> #include <asm/tlb.h> #include <asm/irq_regs.h> -#include <asm/mutex.h> #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #endif -#include "sched_cpupri.h" +#include "sched.h" #include "workqueue_sched.h" -#include "sched_autogroup.h" #define CREATE_TRACE_POINTS #include <trace/events/sched.h> -/* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], - * and back. - */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) -#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) - -/* - * 'User priority' is the nice value converted to something we - * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. - */ -#define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) -#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) - -/* - * Helpers for converting nanosecond timing to jiffy resolution - */ -#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) - -#define NICE_0_LOAD SCHED_LOAD_SCALE -#define NICE_0_SHIFT SCHED_LOAD_SHIFT - -/* - * These are the 'tuning knobs' of the scheduler: - * - * default timeslice is 100 msecs (used only for SCHED_RR tasks). - * Timeslices get refilled after they expire. - */ -#define DEF_TIMESLICE (100 * HZ / 1000) - -/* - * single value that denotes runtime == period, ie unlimited time. - */ -#define RUNTIME_INF ((u64)~0ULL) - -static inline int rt_policy(int policy) -{ - if (policy == SCHED_FIFO || policy == SCHED_RR) - return 1; - return 0; -} - -static inline int task_has_rt_policy(struct task_struct *p) -{ - return rt_policy(p->policy); -} - -/* - * This is the priority-queue data structure of the RT scheduling class: - */ -struct rt_prio_array { - DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ - struct list_head queue[MAX_RT_PRIO]; -}; - -struct rt_bandwidth { - /* nests inside the rq lock: */ - raw_spinlock_t rt_runtime_lock; - ktime_t rt_period; - u64 rt_runtime; - struct hrtimer rt_period_timer; -}; - -static struct rt_bandwidth def_rt_bandwidth; - -static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); - -static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) -{ - struct rt_bandwidth *rt_b = - container_of(timer, struct rt_bandwidth, rt_period_timer); - ktime_t now; - int overrun; - int idle = 0; - - for (;;) { - now = hrtimer_cb_get_time(timer); - overrun = hrtimer_forward(timer, now, rt_b->rt_period); - - if (!overrun) - break; - - idle = do_sched_rt_period_timer(rt_b, overrun); - } - - return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; -} - -static -void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) -{ - rt_b->rt_period = ns_to_ktime(period); - rt_b->rt_runtime = runtime; - - raw_spin_lock_init(&rt_b->rt_runtime_lock); - - hrtimer_init(&rt_b->rt_period_timer, - CLOCK_MONOTONIC, HRTIMER_MODE_REL); - rt_b->rt_period_timer.function = sched_rt_period_timer; -} - -static inline int rt_bandwidth_enabled(void) -{ - return sysctl_sched_rt_runtime >= 0; -} - -static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) +void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) { unsigned long delta; ktime_t soft, hard, now; @@ -218,609 +104,12 @@ static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) } } -static void start_rt_bandwidth(struct rt_bandwidth *rt_b) -{ - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) - return; - - if (hrtimer_active(&rt_b->rt_period_timer)) - return; - - raw_spin_lock(&rt_b->rt_runtime_lock); - start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); - raw_spin_unlock(&rt_b->rt_runtime_lock); -} - -#ifdef CONFIG_RT_GROUP_SCHED -static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) -{ - hrtimer_cancel(&rt_b->rt_period_timer); -} -#endif - -/* - * sched_domains_mutex serializes calls to init_sched_domains, - * detach_destroy_domains and partition_sched_domains. - */ -static DEFINE_MUTEX(sched_domains_mutex); - -#ifdef CONFIG_CGROUP_SCHED - -#include <linux/cgroup.h> - -struct cfs_rq; - -static LIST_HEAD(task_groups); - -struct cfs_bandwidth { -#ifdef CONFIG_CFS_BANDWIDTH - raw_spinlock_t lock; - ktime_t period; - u64 quota, runtime; - s64 hierarchal_quota; - u64 runtime_expires; - - int idle, timer_active; - struct hrtimer period_timer, slack_timer; - struct list_head throttled_cfs_rq; - - /* statistics */ - int nr_periods, nr_throttled; - u64 throttled_time; -#endif -}; - -/* task group related information */ -struct task_group { - struct cgroup_subsys_state css; - -#ifdef CONFIG_FAIR_GROUP_SCHED - /* schedulable entities of this group on each cpu */ - struct sched_entity **se; - /* runqueue "owned" by this group on each cpu */ - struct cfs_rq **cfs_rq; - unsigned long shares; - - atomic_t load_weight; -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - struct sched_rt_entity **rt_se; - struct rt_rq **rt_rq; - - struct rt_bandwidth rt_bandwidth; -#endif - - struct rcu_head rcu; - struct list_head list; - - struct task_group *parent; - struct list_head siblings; - struct list_head children; - -#ifdef CONFIG_SCHED_AUTOGROUP - struct autogroup *autogroup; -#endif - - struct cfs_bandwidth cfs_bandwidth; -}; - -/* task_group_lock serializes the addition/removal of task groups */ -static DEFINE_SPINLOCK(task_group_lock); - -#ifdef CONFIG_FAIR_GROUP_SCHED - -# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD - -/* - * A weight of 0 or 1 can cause arithmetics problems. - * A weight of a cfs_rq is the sum of weights of which entities - * are queued on this cfs_rq, so a weight of a entity should not be - * too large, so as the shares value of a task group. - * (The default weight is 1024 - so there's no practical - * limitation from this.) - */ -#define MIN_SHARES (1UL << 1) -#define MAX_SHARES (1UL << 18) - -static int root_task_group_load = ROOT_TASK_GROUP_LOAD; -#endif - -/* Default task group. - * Every task in system belong to this group at bootup. - */ -struct task_group root_task_group; - -#endif /* CONFIG_CGROUP_SCHED */ - -/* CFS-related fields in a runqueue */ -struct cfs_rq { - struct load_weight load; - unsigned long nr_running, h_nr_running; - - u64 exec_clock; - u64 min_vruntime; -#ifndef CONFIG_64BIT - u64 min_vruntime_copy; -#endif - - struct rb_root tasks_timeline; - struct rb_node *rb_leftmost; - - struct list_head tasks; - struct list_head *balance_iterator; - - /* - * 'curr' points to currently running entity on this cfs_rq. - * It is set to NULL otherwise (i.e when none are currently running). - */ - struct sched_entity *curr, *next, *last, *skip; - -#ifdef CONFIG_SCHED_DEBUG - unsigned int nr_spread_over; -#endif - -#ifdef CONFIG_FAIR_GROUP_SCHED - struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ - - /* - * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in - * a hierarchy). Non-leaf lrqs hold other higher schedulable entities - * (like users, containers etc.) - * - * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This - * list is used during load balance. - */ - int on_list; - struct list_head leaf_cfs_rq_list; - struct task_group *tg; /* group that "owns" this runqueue */ - -#ifdef CONFIG_SMP - /* - * the part of load.weight contributed by tasks - */ - unsigned long task_weight; - - /* - * h_load = weight * f(tg) - * - * Where f(tg) is the recursive weight fraction assigned to - * this group. - */ - unsigned long h_load; - - /* - * Maintaining per-cpu shares distribution for group scheduling - * - * load_stamp is the last time we updated the load average - * load_last is the last time we updated the load average and saw load - * load_unacc_exec_time is currently unaccounted execution time - */ - u64 load_avg; - u64 load_period; - u64 load_stamp, load_last, load_unacc_exec_time; - - unsigned long load_contribution; -#endif -#ifdef CONFIG_CFS_BANDWIDTH - int runtime_enabled; - u64 runtime_expires; - s64 runtime_remaining; - - u64 throttled_timestamp; - int throttled, throttle_count; - struct list_head throttled_list; -#endif -#endif -}; - -#ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_CFS_BANDWIDTH -static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) -{ - return &tg->cfs_bandwidth; -} - -static inline u64 default_cfs_period(void); -static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); -static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); - -static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) -{ - struct cfs_bandwidth *cfs_b = - container_of(timer, struct cfs_bandwidth, slack_timer); - do_sched_cfs_slack_timer(cfs_b); - - return HRTIMER_NORESTART; -} - -static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) -{ - struct cfs_bandwidth *cfs_b = - container_of(timer, struct cfs_bandwidth, period_timer); - ktime_t now; - int overrun; - int idle = 0; - - for (;;) { - now = hrtimer_cb_get_time(timer); - overrun = hrtimer_forward(timer, now, cfs_b->period); - - if (!overrun) - break; - - idle = do_sched_cfs_period_timer(cfs_b, overrun); - } - - return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; -} - -static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) -{ - raw_spin_lock_init(&cfs_b->lock); - cfs_b->runtime = 0; - cfs_b->quota = RUNTIME_INF; - cfs_b->period = ns_to_ktime(default_cfs_period()); - - INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); - hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cfs_b->period_timer.function = sched_cfs_period_timer; - hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cfs_b->slack_timer.function = sched_cfs_slack_timer; -} - -static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - cfs_rq->runtime_enabled = 0; - INIT_LIST_HEAD(&cfs_rq->throttled_list); -} - -/* requires cfs_b->lock, may release to reprogram timer */ -static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) -{ - /* - * The timer may be active because we're trying to set a new bandwidth - * period or because we're racing with the tear-down path - * (timer_active==0 becomes visible before the hrtimer call-back - * terminates). In either case we ensure that it's re-programmed - */ - while (unlikely(hrtimer_active(&cfs_b->period_timer))) { - raw_spin_unlock(&cfs_b->lock); - /* ensure cfs_b->lock is available while we wait */ - hrtimer_cancel(&cfs_b->period_timer); - - raw_spin_lock(&cfs_b->lock); - /* if someone else restarted the timer then we're done */ - if (cfs_b->timer_active) - return; - } - - cfs_b->timer_active = 1; - start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); -} - -static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) -{ - hrtimer_cancel(&cfs_b->period_timer); - hrtimer_cancel(&cfs_b->slack_timer); -} - -#ifdef HAVE_JUMP_LABEL -static struct jump_label_key __cfs_bandwidth_used; - -static inline bool cfs_bandwidth_used(void) -{ - return static_branch(&__cfs_bandwidth_used); -} - -static void account_cfs_bandwidth_used(int enabled, int was_enabled) -{ - /* only need to count groups transitioning between enabled/!enabled */ - if (enabled && !was_enabled) - jump_label_inc(&__cfs_bandwidth_used); - else if (!enabled && was_enabled) - jump_label_dec(&__cfs_bandwidth_used); -} -#else /* !HAVE_JUMP_LABEL */ -/* static_branch doesn't help unless supported */ -static int cfs_bandwidth_used(void) -{ - return 1; -} -static void account_cfs_bandwidth_used(int enabled, int was_enabled) {} -#endif /* HAVE_JUMP_LABEL */ -#else /* !CONFIG_CFS_BANDWIDTH */ -static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} -static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} -static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} - -static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) -{ - return NULL; -} -#endif /* CONFIG_CFS_BANDWIDTH */ -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -/* Real-Time classes' related field in a runqueue: */ -struct rt_rq { - struct rt_prio_array active; - unsigned long rt_nr_running; -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - struct { - int curr; /* highest queued rt task prio */ -#ifdef CONFIG_SMP - int next; /* next highest */ -#endif - } highest_prio; -#endif -#ifdef CONFIG_SMP - unsigned long rt_nr_migratory; - unsigned long rt_nr_total; - int overloaded; - struct plist_head pushable_tasks; -#endif - int rt_throttled; - u64 rt_time; - u64 rt_runtime; - /* Nests inside the rq lock: */ - raw_spinlock_t rt_runtime_lock; - -#ifdef CONFIG_RT_GROUP_SCHED - unsigned long rt_nr_boosted; - - struct rq *rq; - struct list_head leaf_rt_rq_list; - struct task_group *tg; -#endif -}; - -#ifdef CONFIG_SMP - -/* - * We add the notion of a root-domain which will be used to define per-domain - * variables. Each exclusive cpuset essentially defines an island domain by - * fully partitioning the member cpus from any other cpuset. Whenever a new - * exclusive cpuset is created, we also create and attach a new root-domain - * object. - * - */ -struct root_domain { - atomic_t refcount; - atomic_t rto_count; - struct rcu_head rcu; - cpumask_var_t span; - cpumask_var_t online; - - /* - * The "RT overload" flag: it gets set if a CPU has more than - * one runnable RT task. - */ - cpumask_var_t rto_mask; - struct cpupri cpupri; -}; - -/* - * By default the system creates a single root-domain with all cpus as - * members (mimicking the global state we have today). - */ -static struct root_domain def_root_domain; - -#endif /* CONFIG_SMP */ - -/* - * This is the main, per-CPU runqueue data structure. - * - * Locking rule: those places that want to lock multiple runqueues - * (such as the load balancing or the thread migration code), lock - * acquire operations must be ordered by ascending &runqueue. - */ -struct rq { - /* runqueue lock: */ - raw_spinlock_t lock; - - /* - * nr_running and cpu_load should be in the same cacheline because - * remote CPUs use both these fields when doing load calculation. - */ - unsigned long nr_running; - #define CPU_LOAD_IDX_MAX 5 - unsigned long cpu_load[CPU_LOAD_IDX_MAX]; - unsigned long last_load_update_tick; -#ifdef CONFIG_NO_HZ - u64 nohz_stamp; - unsigned char nohz_balance_kick; -#endif - int skip_clock_update; - - /* capture load from *all* tasks on this cpu: */ - struct load_weight load; - unsigned long nr_load_updates; - u64 nr_switches; - - struct cfs_rq cfs; - struct rt_rq rt; - -#ifdef CONFIG_FAIR_GROUP_SCHED - /* list of leaf cfs_rq on this cpu: */ - struct list_head leaf_cfs_rq_list; -#endif -#ifdef CONFIG_RT_GROUP_SCHED - struct list_head leaf_rt_rq_list; -#endif - - /* - * This is part of a global counter where only the total sum - * over all CPUs matters. A task can increase this counter on - * one CPU and if it got migrated afterwards it may decrease - * it on another CPU. Always updated under the runqueue lock: - */ - unsigned long nr_uninterruptible; - - struct task_struct *curr, *idle, *stop; - unsigned long next_balance; - struct mm_struct *prev_mm; - - u64 clock; - u64 clock_task; - - atomic_t nr_iowait; - -#ifdef CONFIG_SMP - struct root_domain *rd; - struct sched_domain *sd; - - unsigned long cpu_power; - - unsigned char idle_balance; - /* For active balancing */ - int post_schedule; - int active_balance; - int push_cpu; - struct cpu_stop_work active_balance_work; - /* cpu of this runqueue: */ - int cpu; - int online; - - u64 rt_avg; - u64 age_stamp; - u64 idle_stamp; - u64 avg_idle; -#endif - -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - u64 prev_irq_time; -#endif -#ifdef CONFIG_PARAVIRT - u64 prev_steal_time; -#endif -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - u64 prev_steal_time_rq; -#endif - - /* calc_load related fields */ - unsigned long calc_load_update; - long calc_load_active; - -#ifdef CONFIG_SCHED_HRTICK -#ifdef CONFIG_SMP - int hrtick_csd_pending; - struct call_single_data hrtick_csd; -#endif - struct hrtimer hrtick_timer; -#endif - -#ifdef CONFIG_SCHEDSTATS - /* latency stats */ - struct sched_info rq_sched_info; - unsigned long long rq_cpu_time; - /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ - - /* sys_sched_yield() stats */ - unsigned int yld_count; - - /* schedule() stats */ - unsigned int sched_switch; - unsigned int sched_count; - unsigned int sched_goidle; - - /* try_to_wake_up() stats */ - unsigned int ttwu_count; - unsigned int ttwu_local; -#endif - -#ifdef CONFIG_SMP - struct llist_head wake_list; -#endif -}; - -static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); - - -static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); - -static inline int cpu_of(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq->cpu; -#else - return 0; -#endif -} - -#define rcu_dereference_check_sched_domain(p) \ - rcu_dereference_check((p), \ - lockdep_is_held(&sched_domains_mutex)) - -/* - * The domain tree (rq->sd) is protected by RCU's quiescent state transition. - * See detach_destroy_domains: synchronize_sched for details. - * - * The domain tree of any CPU may only be accessed from within - * preempt-disabled sections. - */ -#define for_each_domain(cpu, __sd) \ - for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) - -#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -#define this_rq() (&__get_cpu_var(runqueues)) -#define task_rq(p) cpu_rq(task_cpu(p)) -#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -#define raw_rq() (&__raw_get_cpu_var(runqueues)) - -#ifdef CONFIG_CGROUP_SCHED - -/* - * Return the group to which this tasks belongs. - * - * We use task_subsys_state_check() and extend the RCU verification with - * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each - * task it moves into the cgroup. Therefore by holding either of those locks, - * we pin the task to the current cgroup. - */ -static inline struct task_group *task_group(struct task_struct *p) -{ - struct task_group *tg; - struct cgroup_subsys_state *css; - - css = task_subsys_state_check(p, cpu_cgroup_subsys_id, - lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&task_rq(p)->lock)); - tg = container_of(css, struct task_group, css); - - return autogroup_task_group(p, tg); -} - -/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) -{ -#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED) - struct task_group *tg = task_group(p); -#endif - -#ifdef CONFIG_FAIR_GROUP_SCHED - p->se.cfs_rq = tg->cfs_rq[cpu]; - p->se.parent = tg->se[cpu]; -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - p->rt.rt_rq = tg->rt_rq[cpu]; - p->rt.parent = tg->rt_se[cpu]; -#endif -} - -#else /* CONFIG_CGROUP_SCHED */ - -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline struct task_group *task_group(struct task_struct *p) -{ - return NULL; -} - -#endif /* CONFIG_CGROUP_SCHED */ +DEFINE_MUTEX(sched_domains_mutex); +DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); static void update_rq_clock_task(struct rq *rq, s64 delta); -static void update_rq_clock(struct rq *rq) +void update_rq_clock(struct rq *rq) { s64 delta; @@ -833,40 +122,10 @@ static void update_rq_clock(struct rq *rq) } /* - * Tunables that become constants when CONFIG_SCHED_DEBUG is off: - */ -#ifdef CONFIG_SCHED_DEBUG -# define const_debug __read_mostly -#else -# define const_debug static const -#endif - -/** - * runqueue_is_locked - Returns true if the current cpu runqueue is locked - * @cpu: the processor in question. - * - * This interface allows printk to be called with the runqueue lock - * held and know whether or not it is OK to wake up the klogd. - */ -int runqueue_is_locked(int cpu) -{ - return raw_spin_is_locked(&cpu_rq(cpu)->lock); -} - -/* * Debugging: various feature bits */ #define SCHED_FEAT(name, enabled) \ - __SCHED_FEAT_##name , - -enum { -#include "sched_features.h" -}; - -#undef SCHED_FEAT - -#define SCHED_FEAT(name, enabled) \ (1UL << __SCHED_FEAT_##name) * enabled | const_debug unsigned int sysctl_sched_features = @@ -965,8 +224,6 @@ late_initcall(sched_init_debug); #endif -#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) - /* * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. @@ -987,7 +244,7 @@ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; */ unsigned int sysctl_sched_rt_period = 1000000; -static __read_mostly int scheduler_running; +__read_mostly int scheduler_running; /* * part of the period that we allow rt tasks to run in us. @@ -995,112 +252,7 @@ static __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; -static inline u64 global_rt_period(void) -{ - return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; -} -static inline u64 global_rt_runtime(void) -{ - if (sysctl_sched_rt_runtime < 0) - return RUNTIME_INF; - - return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; -} - -#ifndef prepare_arch_switch -# define prepare_arch_switch(next) do { } while (0) -#endif -#ifndef finish_arch_switch -# define finish_arch_switch(prev) do { } while (0) -#endif - -static inline int task_current(struct rq *rq, struct task_struct *p) -{ - return rq->curr == p; -} - -static inline int task_running(struct rq *rq, struct task_struct *p) -{ -#ifdef CONFIG_SMP - return p->on_cpu; -#else - return task_current(rq, p); -#endif -} - -#ifndef __ARCH_WANT_UNLOCKED_CTXSW -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -#ifdef CONFIG_SMP - /* - * We can optimise this out completely for !SMP, because the - * SMP rebalancing from interrupt is the only thing that cares - * here. - */ - next->on_cpu = 1; -#endif -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_SMP - /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely - * finished. - */ - smp_wmb(); - prev->on_cpu = 0; -#endif -#ifdef CONFIG_DEBUG_SPINLOCK - /* this is a valid case when another task releases the spinlock */ - rq->lock.owner = current; -#endif - /* - * If we are tracking spinlock dependencies then we have to - * fix up the runqueue lock - which gets 'carried over' from - * prev into current: - */ - spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - - raw_spin_unlock_irq(&rq->lock); -} - -#else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -#ifdef CONFIG_SMP - /* - * We can optimise this out completely for !SMP, because the - * SMP rebalancing from interrupt is the only thing that cares - * here. - */ - next->on_cpu = 1; -#endif -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - raw_spin_unlock_irq(&rq->lock); -#else - raw_spin_unlock(&rq->lock); -#endif -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_SMP - /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely - * finished. - */ - smp_wmb(); - prev->on_cpu = 0; -#endif -#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_enable(); -#endif -} -#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* * __task_rq_lock - lock the rq @p resides on. @@ -1183,20 +335,6 @@ static struct rq *this_rq_lock(void) * rq->lock. */ -/* - * Use hrtick when: - * - enabled by features - * - hrtimer is actually high res - */ -static inline int hrtick_enabled(struct rq *rq) -{ - if (!sched_feat(HRTICK)) - return 0; - if (!cpu_active(cpu_of(rq))) - return 0; - return hrtimer_is_hres_active(&rq->hrtick_timer); -} - static void hrtick_clear(struct rq *rq) { if (hrtimer_active(&rq->hrtick_timer)) @@ -1240,7 +378,7 @@ static void __hrtick_start(void *arg) * * called with rq->lock held and irqs disabled */ -static void hrtick_start(struct rq *rq, u64 delay) +void hrtick_start(struct rq *rq, u64 delay) { struct hrtimer *timer = &rq->hrtick_timer; ktime_t time = ktime_add_ns(timer->base->get_time(), delay); @@ -1284,7 +422,7 @@ static __init void init_hrtick(void) * * called with rq->lock held and irqs disabled */ -static void hrtick_start(struct rq *rq, u64 delay) +void hrtick_start(struct rq *rq, u64 delay) { __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, HRTIMER_MODE_REL_PINNED, 0); @@ -1335,7 +473,7 @@ static inline void init_hrtick(void) #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) #endif -static void resched_task(struct task_struct *p) +void resched_task(struct task_struct *p) { int cpu; @@ -1356,7 +494,7 @@ static void resched_task(struct task_struct *p) smp_send_reschedule(cpu); } -static void resched_cpu(int cpu) +void resched_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -1449,12 +587,7 @@ static inline bool got_nohz_idle_kick(void) #endif /* CONFIG_NO_HZ */ -static u64 sched_avg_period(void) -{ - return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; -} - -static void sched_avg_update(struct rq *rq) +void sched_avg_update(struct rq *rq) { s64 period = sched_avg_period(); @@ -1470,193 +603,23 @@ static void sched_avg_update(struct rq *rq) } } -static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) -{ - rq->rt_avg += rt_delta; - sched_avg_update(rq); -} - #else /* !CONFIG_SMP */ -static void resched_task(struct task_struct *p) +void resched_task(struct task_struct *p) { assert_raw_spin_locked(&task_rq(p)->lock); set_tsk_need_resched(p); } - -static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) -{ -} - -static void sched_avg_update(struct rq *rq) -{ -} #endif /* CONFIG_SMP */ -#if BITS_PER_LONG == 32 -# define WMULT_CONST (~0UL) -#else -# define WMULT_CONST (1UL << 32) -#endif - -#define WMULT_SHIFT 32 - -/* - * Shift right and round: - */ -#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) - -/* - * delta *= weight / lw - */ -static unsigned long -calc_delta_mine(unsigned long delta_exec, unsigned long weight, - struct load_weight *lw) -{ - u64 tmp; - - /* - * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched - * entities since MIN_SHARES = 2. Treat weight as 1 if less than - * 2^SCHED_LOAD_RESOLUTION. - */ - if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) - tmp = (u64)delta_exec * scale_load_down(weight); - else - tmp = (u64)delta_exec; - - if (!lw->inv_weight) { - unsigned long w = scale_load_down(lw->weight); - - if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) - lw->inv_weight = 1; - else if (unlikely(!w)) - lw->inv_weight = WMULT_CONST; - else - lw->inv_weight = WMULT_CONST / w; - } - - /* - * Check whether we'd overflow the 64-bit multiplication: - */ - if (unlikely(tmp > WMULT_CONST)) - tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, - WMULT_SHIFT/2); - else - tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); - - return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); -} - -static inline void update_load_add(struct load_weight *lw, unsigned long inc) -{ - lw->weight += inc; - lw->inv_weight = 0; -} - -static inline void update_load_sub(struct load_weight *lw, unsigned long dec) -{ - lw->weight -= dec; - lw->inv_weight = 0; -} - -static inline void update_load_set(struct load_weight *lw, unsigned long w) -{ - lw->weight = w; - lw->inv_weight = 0; -} - -/* - * To aid in avoiding the subversion of "niceness" due to uneven distribution - * of tasks with abnormal "nice" values across CPUs the contribution that - * each task makes to its run queue's load is weighted according to its - * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a - * scaled version of the new time slice allocation that they receive on time - * slice expiry etc. - */ - -#define WEIGHT_IDLEPRIO 3 -#define WMULT_IDLEPRIO 1431655765 - -/* - * Nice levels are multiplicative, with a gentle 10% change for every - * nice level changed. I.e. when a CPU-bound task goes from nice 0 to - * nice 1, it will get ~10% less CPU time than another CPU-bound task - * that remained on nice 0. - * - * The "10% effect" is relative and cumulative: from _any_ nice level, - * if you go up 1 level, it's -10% CPU usage, if you go down 1 level - * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. - * If a task goes up by ~10% and another task goes down by ~10% then - * the relative distance between them is ~25%.) - */ -static const int prio_to_weight[40] = { - /* -20 */ 88761, 71755, 56483, 46273, 36291, - /* -15 */ 29154, 23254, 18705, 14949, 11916, - /* -10 */ 9548, 7620, 6100, 4904, 3906, - /* -5 */ 3121, 2501, 1991, 1586, 1277, - /* 0 */ 1024, 820, 655, 526, 423, - /* 5 */ 335, 272, 215, 172, 137, - /* 10 */ 110, 87, 70, 56, 45, - /* 15 */ 36, 29, 23, 18, 15, -}; - -/* - * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. - * - * In cases where the weight does not change often, we can use the - * precalculated inverse to speed up arithmetics by turning divisions - * into multiplications: - */ -static const u32 prio_to_wmult[40] = { - /* -20 */ 48388, 59856, 76040, 92818, 118348, - /* -15 */ 147320, 184698, 229616, 287308, 360437, - /* -10 */ 449829, 563644, 704093, 875809, 1099582, - /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, - /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, - /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, - /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, - /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, -}; - -/* Time spent by the tasks of the cpu accounting group executing in ... */ -enum cpuacct_stat_index { - CPUACCT_STAT_USER, /* ... user mode */ - CPUACCT_STAT_SYSTEM, /* ... kernel mode */ - - CPUACCT_STAT_NSTATS, -}; - -#ifdef CONFIG_CGROUP_CPUACCT -static void cpuacct_charge(struct task_struct *tsk, u64 cputime); -static void cpuacct_update_stats(struct task_struct *tsk, - enum cpuacct_stat_index idx, cputime_t val); -#else -static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} -static inline void cpuacct_update_stats(struct task_struct *tsk, - enum cpuacct_stat_index idx, cputime_t val) {} -#endif - -static inline void inc_cpu_load(struct rq *rq, unsigned long load) -{ - update_load_add(&rq->load, load); -} - -static inline void dec_cpu_load(struct rq *rq, unsigned long load) -{ - update_load_sub(&rq->load, load); -} - #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) -typedef int (*tg_visitor)(struct task_group *, void *); - /* * Iterate task_group tree rooted at *from, calling @down when first entering a * node and @up when leaving it for the final time. * * Caller must hold rcu_lock or sufficient equivalent. */ -static int walk_tg_tree_from(struct task_group *from, +int walk_tg_tree_from(struct task_group *from, tg_visitor down, tg_visitor up, void *data) { struct task_group *parent, *child; @@ -1687,270 +650,13 @@ out: return ret; } -/* - * Iterate the full tree, calling @down when first entering a node and @up when - * leaving it for the final time. - * - * Caller must hold rcu_lock or sufficient equivalent. - */ - -static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) -{ - return walk_tg_tree_from(&root_task_group, down, up, data); -} - -static int tg_nop(struct task_group *tg, void *data) +int tg_nop(struct task_group *tg, void *data) { return 0; } #endif -#ifdef CONFIG_SMP -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) -{ - return cpu_rq(cpu)->load.weight; -} - -/* - * Return a low guess at the load of a migration-source cpu weighted - * according to the scheduling class and "nice" value. - * - * We want to under-estimate the load of migration sources, to - * balance conservatively. - */ -static unsigned long source_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return min(rq->cpu_load[type-1], total); -} - -/* - * Return a high guess at the load of a migration-target cpu weighted - * according to the scheduling class and "nice" value. - */ -static unsigned long target_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return max(rq->cpu_load[type-1], total); -} - -static unsigned long power_of(int cpu) -{ - return cpu_rq(cpu)->cpu_power; -} - -static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); - -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = ACCESS_ONCE(rq->nr_running); - - if (nr_running) - return rq->load.weight / nr_running; - - return 0; -} - -#ifdef CONFIG_PREEMPT - -static void double_rq_lock(struct rq *rq1, struct rq *rq2); - -/* - * fair double_lock_balance: Safely acquires both rq->locks in a fair - * way at the expense of forcing extra atomic operations in all - * invocations. This assures that the double_lock is acquired using the - * same underlying policy as the spinlock_t on this architecture, which - * reduces latency compared to the unfair variant below. However, it - * also adds more overhead and therefore may reduce throughput. - */ -static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) - __releases(this_rq->lock) - __acquires(busiest->lock) - __acquires(this_rq->lock) -{ - raw_spin_unlock(&this_rq->lock); - double_rq_lock(this_rq, busiest); - - return 1; -} - -#else -/* - * Unfair double_lock_balance: Optimizes throughput at the expense of - * latency by eliminating extra atomic operations when the locks are - * already in proper order on entry. This favors lower cpu-ids and will - * grant the double lock to lower cpus over higher ids under contention, - * regardless of entry order into the function. - */ -static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) - __releases(this_rq->lock) - __acquires(busiest->lock) - __acquires(this_rq->lock) -{ - int ret = 0; - - if (unlikely(!raw_spin_trylock(&busiest->lock))) { - if (busiest < this_rq) { - raw_spin_unlock(&this_rq->lock); - raw_spin_lock(&busiest->lock); - raw_spin_lock_nested(&this_rq->lock, - SINGLE_DEPTH_NESTING); - ret = 1; - } else - raw_spin_lock_nested(&busiest->lock, - SINGLE_DEPTH_NESTING); - } - return ret; -} - -#endif /* CONFIG_PREEMPT */ - -/* - * double_lock_balance - lock the busiest runqueue, this_rq is locked already. - */ -static int double_lock_balance(struct rq *this_rq, struct rq *busiest) -{ - if (unlikely(!irqs_disabled())) { - /* printk() doesn't work good under rq->lock */ - raw_spin_unlock(&this_rq->lock); - BUG_ON(1); - } - - return _double_lock_balance(this_rq, busiest); -} - -static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) - __releases(busiest->lock) -{ - raw_spin_unlock(&busiest->lock); - lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); -} - -/* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. - */ -static void double_rq_lock(struct rq *rq1, struct rq *rq2) - __acquires(rq1->lock) - __acquires(rq2->lock) -{ - BUG_ON(!irqs_disabled()); - if (rq1 == rq2) { - raw_spin_lock(&rq1->lock); - __acquire(rq2->lock); /* Fake it out ;) */ - } else { - if (rq1 < rq2) { - raw_spin_lock(&rq1->lock); - raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); - } else { - raw_spin_lock(&rq2->lock); - raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); - } - } -} - -/* - * double_rq_unlock - safely unlock two runqueues - * - * Note this does not restore interrupts like task_rq_unlock, - * you need to do so manually after calling. - */ -static void double_rq_unlock(struct rq *rq1, struct rq *rq2) - __releases(rq1->lock) - __releases(rq2->lock) -{ - raw_spin_unlock(&rq1->lock); - if (rq1 != rq2) - raw_spin_unlock(&rq2->lock); - else - __release(rq2->lock); -} - -#else /* CONFIG_SMP */ - -/* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. - */ -static void double_rq_lock(struct rq *rq1, struct rq *rq2) - __acquires(rq1->lock) - __acquires(rq2->lock) -{ - BUG_ON(!irqs_disabled()); - BUG_ON(rq1 != rq2); - raw_spin_lock(&rq1->lock); - __acquire(rq2->lock); /* Fake it out ;) */ -} - -/* - * double_rq_unlock - safely unlock two runqueues - * - * Note this does not restore interrupts like task_rq_unlock, - * you need to do so manually after calling. - */ -static void double_rq_unlock(struct rq *rq1, struct rq *rq2) - __releases(rq1->lock) - __releases(rq2->lock) -{ - BUG_ON(rq1 != rq2); - raw_spin_unlock(&rq1->lock); - __release(rq2->lock); -} - -#endif - -static void calc_load_account_idle(struct rq *this_rq); -static void update_sysctl(void); -static int get_update_sysctl_factor(void); -static void update_cpu_load(struct rq *this_rq); - -static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -{ - set_task_rq(p, cpu); -#ifdef CONFIG_SMP - /* - * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be - * successfully executed on another CPU. We must ensure that updates of - * per-task data have been completed by this moment. - */ - smp_wmb(); - task_thread_info(p)->cpu = cpu; -#endif -} - -static const struct sched_class rt_sched_class; - -#define sched_class_highest (&stop_sched_class) -#define for_each_class(class) \ - for (class = sched_class_highest; class; class = class->next) - -#include "sched_stats.h" - -static void inc_nr_running(struct rq *rq) -{ - rq->nr_running++; -} - -static void dec_nr_running(struct rq *rq) -{ - rq->nr_running--; -} +void update_cpu_load(struct rq *this_rq); static void set_load_weight(struct task_struct *p) { @@ -1987,7 +693,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) /* * activate_task - move a task to the runqueue. */ -static void activate_task(struct rq *rq, struct task_struct *p, int flags) +void activate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) rq->nr_uninterruptible--; @@ -1998,7 +704,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags) /* * deactivate_task - remove a task from the runqueue. */ -static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) +void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) rq->nr_uninterruptible++; @@ -2223,15 +929,6 @@ static int irqtime_account_si_update(void) #endif -#include "sched_idletask.c" -#include "sched_fair.c" -#include "sched_rt.c" -#include "sched_autogroup.c" -#include "sched_stoptask.c" -#ifdef CONFIG_SCHED_DEBUG -# include "sched_debug.c" -#endif - void sched_set_stop_task(int cpu, struct task_struct *stop) { struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; @@ -2329,7 +1026,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio); } -static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) +void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { const struct sched_class *class; @@ -2355,38 +1052,6 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP -/* - * Is this task likely cache-hot: - */ -static int -task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) -{ - s64 delta; - - if (p->sched_class != &fair_sched_class) - return 0; - - if (unlikely(p->policy == SCHED_IDLE)) - return 0; - - /* - * Buddy candidates are cache hot: - */ - if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && - (&p->se == cfs_rq_of(&p->se)->next || - &p->se == cfs_rq_of(&p->se)->last)) - return 1; - - if (sysctl_sched_migration_cost == -1) - return 1; - if (sysctl_sched_migration_cost == 0) - return 0; - - delta = now - p->se.exec_start; - - return delta < (s64)sysctl_sched_migration_cost; -} - void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { #ifdef CONFIG_SCHED_DEBUG @@ -3469,7 +2134,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) */ static atomic_long_t calc_load_tasks_idle; -static void calc_load_account_idle(struct rq *this_rq) +void calc_load_account_idle(struct rq *this_rq) { long delta; @@ -3613,7 +2278,7 @@ static void calc_global_nohz(unsigned long ticks) */ } #else -static void calc_load_account_idle(struct rq *this_rq) +void calc_load_account_idle(struct rq *this_rq) { } @@ -3756,7 +2421,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) * scheduler tick (TICK_NSEC). With tickless idle this will not be called * every tick. We fix it up based on jiffies. */ -static void update_cpu_load(struct rq *this_rq) +void update_cpu_load(struct rq *this_rq) { unsigned long this_load = this_rq->load.weight; unsigned long curr_jiffies = jiffies; @@ -6148,53 +4813,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) #endif } -/* - * Increase the granularity value when there are more CPUs, - * because with more CPUs the 'effective latency' as visible - * to users decreases. But the relationship is not linear, - * so pick a second-best guess by going with the log2 of the - * number of CPUs. - * - * This idea comes from the SD scheduler of Con Kolivas: - */ -static int get_update_sysctl_factor(void) -{ - unsigned int cpus = min_t(int, num_online_cpus(), 8); - unsigned int factor; - - switch (sysctl_sched_tunable_scaling) { - case SCHED_TUNABLESCALING_NONE: - factor = 1; - break; - case SCHED_TUNABLESCALING_LINEAR: - factor = cpus; - break; - case SCHED_TUNABLESCALING_LOG: - default: - factor = 1 + ilog2(cpus); - break; - } - - return factor; -} - -static void update_sysctl(void) -{ - unsigned int factor = get_update_sysctl_factor(); - -#define SET_SYSCTL(name) \ - (sysctl_##name = (factor) * normalized_sysctl_##name) - SET_SYSCTL(sched_min_granularity); - SET_SYSCTL(sched_latency); - SET_SYSCTL(sched_wakeup_granularity); -#undef SET_SYSCTL -} - -static inline void sched_init_granularity(void) -{ - update_sysctl(); -} - #ifdef CONFIG_SMP void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { @@ -6381,30 +4999,6 @@ static void calc_global_load_remove(struct rq *rq) rq->calc_load_active = 0; } -#ifdef CONFIG_CFS_BANDWIDTH -static void unthrottle_offline_cfs_rqs(struct rq *rq) -{ - struct cfs_rq *cfs_rq; - - for_each_leaf_cfs_rq(rq, cfs_rq) { - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - - if (!cfs_rq->runtime_enabled) - continue; - - /* - * clock_task is not advancing so we just need to make sure - * there's some valid quota amount - */ - cfs_rq->runtime_remaining = cfs_b->quota; - if (cfs_rq_throttled(cfs_rq)) - unthrottle_cfs_rq(cfs_rq); - } -} -#else -static void unthrottle_offline_cfs_rqs(struct rq *rq) {} -#endif - /* * Migrate all tasks from the rq, sleeping tasks will be migrated by * try_to_wake_up()->select_task_rq(). @@ -7010,6 +5604,12 @@ out: return -ENOMEM; } +/* + * By default the system creates a single root-domain with all cpus as + * members (mimicking the global state we have today). + */ +struct root_domain def_root_domain; + static void init_defrootdomain(void) { init_rootdomain(&def_root_domain); @@ -7418,6 +6018,11 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) update_group_power(sd, cpu); } +int __weak arch_sd_sibling_asym_packing(void) +{ + return 0*SD_ASYM_PACKING; +} + /* * Initializers for schedule domains * Non-inlined to reduce accumulated stack pressure in build_sched_domains() @@ -8053,29 +6658,6 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, } } -static int update_runtime(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - disable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - enable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} - void __init sched_init_smp(void) { cpumask_var_t non_isolated_cpus; @@ -8124,104 +6706,11 @@ int in_sched_functions(unsigned long addr) && addr < (unsigned long)__sched_text_end); } -static void init_cfs_rq(struct cfs_rq *cfs_rq) -{ - cfs_rq->tasks_timeline = RB_ROOT; - INIT_LIST_HEAD(&cfs_rq->tasks); - cfs_rq->min_vruntime = (u64)(-(1LL << 20)); -#ifndef CONFIG_64BIT - cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; -#endif -} - -static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) -{ - struct rt_prio_array *array; - int i; - - array = &rt_rq->active; - for (i = 0; i < MAX_RT_PRIO; i++) { - INIT_LIST_HEAD(array->queue + i); - __clear_bit(i, array->bitmap); - } - /* delimiter for bitsearch: */ - __set_bit(MAX_RT_PRIO, array->bitmap); - -#if defined CONFIG_SMP - rt_rq->highest_prio.curr = MAX_RT_PRIO; - rt_rq->highest_prio.next = MAX_RT_PRIO; - rt_rq->rt_nr_migratory = 0; - rt_rq->overloaded = 0; - plist_head_init(&rt_rq->pushable_tasks); -#endif - - rt_rq->rt_time = 0; - rt_rq->rt_throttled = 0; - rt_rq->rt_runtime = 0; - raw_spin_lock_init(&rt_rq->rt_runtime_lock); -} - -#ifdef CONFIG_FAIR_GROUP_SCHED -static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, - struct sched_entity *se, int cpu, - struct sched_entity *parent) -{ - struct rq *rq = cpu_rq(cpu); - - cfs_rq->tg = tg; - cfs_rq->rq = rq; -#ifdef CONFIG_SMP - /* allow initial update_cfs_load() to truncate */ - cfs_rq->load_stamp = 1; -#endif - init_cfs_rq_runtime(cfs_rq); - - tg->cfs_rq[cpu] = cfs_rq; - tg->se[cpu] = se; - - /* se could be NULL for root_task_group */ - if (!se) - return; - - if (!parent) - se->cfs_rq = &rq->cfs; - else - se->cfs_rq = parent->my_q; - - se->my_q = cfs_rq; - update_load_set(&se->load, 0); - se->parent = parent; -} +#ifdef CONFIG_CGROUP_SCHED +struct task_group root_task_group; #endif -#ifdef CONFIG_RT_GROUP_SCHED -static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, - struct sched_rt_entity *rt_se, int cpu, - struct sched_rt_entity *parent) -{ - struct rq *rq = cpu_rq(cpu); - - rt_rq->highest_prio.curr = MAX_RT_PRIO; - rt_rq->rt_nr_boosted = 0; - rt_rq->rq = rq; - rt_rq->tg = tg; - - tg->rt_rq[cpu] = rt_rq; - tg->rt_se[cpu] = rt_se; - - if (!rt_se) - return; - - if (!parent) - rt_se->rt_rq = &rq->rt; - else - rt_se->rt_rq = parent->my_q; - - rt_se->my_q = rt_rq; - rt_se->parent = parent; - INIT_LIST_HEAD(&rt_se->run_list); -} -#endif +DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); void __init sched_init(void) { @@ -8294,7 +6783,7 @@ void __init sched_init(void) init_cfs_rq(&rq->cfs); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED - root_task_group.shares = root_task_group_load; + root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); /* * How much cpu bandwidth does root_task_group get? @@ -8357,10 +6846,6 @@ void __init sched_init(void) INIT_HLIST_HEAD(&init_task.preempt_notifiers); #endif -#ifdef CONFIG_SMP - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); -#endif - #ifdef CONFIG_RT_MUTEXES plist_head_init(&init_task.pi_waiters); #endif @@ -8388,17 +6873,11 @@ void __init sched_init(void) #ifdef CONFIG_SMP zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); -#ifdef CONFIG_NO_HZ - zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); - alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); - atomic_set(&nohz.load_balancer, nr_cpu_ids); - atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); - atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); -#endif /* May be allocated at isolcpus cmdline parse time */ if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); -#endif /* SMP */ +#endif + init_sched_fair_class(); scheduler_running = 1; } @@ -8550,169 +7029,14 @@ void set_curr_task(int cpu, struct task_struct *p) #endif -#ifdef CONFIG_FAIR_GROUP_SCHED -static void free_fair_sched_group(struct task_group *tg) -{ - int i; - - destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); - - for_each_possible_cpu(i) { - if (tg->cfs_rq) - kfree(tg->cfs_rq[i]); - if (tg->se) - kfree(tg->se[i]); - } - - kfree(tg->cfs_rq); - kfree(tg->se); -} - -static -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) -{ - struct cfs_rq *cfs_rq; - struct sched_entity *se; - int i; - - tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); - if (!tg->cfs_rq) - goto err; - tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); - if (!tg->se) - goto err; - - tg->shares = NICE_0_LOAD; - - init_cfs_bandwidth(tg_cfs_bandwidth(tg)); - - for_each_possible_cpu(i) { - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), - GFP_KERNEL, cpu_to_node(i)); - if (!cfs_rq) - goto err; - - se = kzalloc_node(sizeof(struct sched_entity), - GFP_KERNEL, cpu_to_node(i)); - if (!se) - goto err_free_rq; - - init_cfs_rq(cfs_rq); - init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); - } - - return 1; - -err_free_rq: - kfree(cfs_rq); -err: - return 0; -} - -static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - /* - * Only empty task groups can be destroyed; so we can speculatively - * check on_list without danger of it being re-added. - */ - if (!tg->cfs_rq[cpu]->on_list) - return; - - raw_spin_lock_irqsave(&rq->lock, flags); - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} -#else /* !CONFIG_FAIR_GROUP_SCHED */ -static inline void free_fair_sched_group(struct task_group *tg) -{ -} - -static inline -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) -{ - return 1; -} - -static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) -{ -} -#endif /* CONFIG_FAIR_GROUP_SCHED */ - #ifdef CONFIG_RT_GROUP_SCHED -static void free_rt_sched_group(struct task_group *tg) -{ - int i; - - if (tg->rt_se) - destroy_rt_bandwidth(&tg->rt_bandwidth); - - for_each_possible_cpu(i) { - if (tg->rt_rq) - kfree(tg->rt_rq[i]); - if (tg->rt_se) - kfree(tg->rt_se[i]); - } - - kfree(tg->rt_rq); - kfree(tg->rt_se); -} - -static -int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) -{ - struct rt_rq *rt_rq; - struct sched_rt_entity *rt_se; - int i; - - tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); - if (!tg->rt_rq) - goto err; - tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); - if (!tg->rt_se) - goto err; - - init_rt_bandwidth(&tg->rt_bandwidth, - ktime_to_ns(def_rt_bandwidth.rt_period), 0); - - for_each_possible_cpu(i) { - rt_rq = kzalloc_node(sizeof(struct rt_rq), - GFP_KERNEL, cpu_to_node(i)); - if (!rt_rq) - goto err; - - rt_se = kzalloc_node(sizeof(struct sched_rt_entity), - GFP_KERNEL, cpu_to_node(i)); - if (!rt_se) - goto err_free_rq; - - init_rt_rq(rt_rq, cpu_rq(i)); - rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; - init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); - } - - return 1; - -err_free_rq: - kfree(rt_rq); -err: - return 0; -} #else /* !CONFIG_RT_GROUP_SCHED */ -static inline void free_rt_sched_group(struct task_group *tg) -{ -} - -static inline -int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) -{ - return 1; -} #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED +/* task_group_lock serializes the addition/removal of task groups */ +static DEFINE_SPINLOCK(task_group_lock); + static void free_sched_group(struct task_group *tg) { free_fair_sched_group(tg); @@ -8818,47 +7142,6 @@ void sched_move_task(struct task_struct *tsk) #endif /* CONFIG_CGROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED -static DEFINE_MUTEX(shares_mutex); - -int sched_group_set_shares(struct task_group *tg, unsigned long shares) -{ - int i; - unsigned long flags; - - /* - * We can't change the weight of the root cgroup. - */ - if (!tg->se[0]) - return -EINVAL; - - shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); - - mutex_lock(&shares_mutex); - if (tg->shares == shares) - goto done; - - tg->shares = shares; - for_each_possible_cpu(i) { - struct rq *rq = cpu_rq(i); - struct sched_entity *se; - - se = tg->se[i]; - /* Propagate contribution to hierarchy */ - raw_spin_lock_irqsave(&rq->lock, flags); - for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se)); - raw_spin_unlock_irqrestore(&rq->lock, flags); - } - -done: - mutex_unlock(&shares_mutex); - return 0; -} - -unsigned long sched_group_shares(struct task_group *tg) -{ - return tg->shares; -} #endif #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) @@ -8883,7 +7166,7 @@ static inline int tg_has_rt_tasks(struct task_group *tg) struct task_struct *g, *p; do_each_thread(g, p) { - if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) + if (rt_task(p) && task_rq(p)->rt.tg == tg) return 1; } while_each_thread(g, p); @@ -9235,7 +7518,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) { int i, ret = 0, runtime_enabled, runtime_was_enabled; - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; if (tg == &root_task_group) return -EINVAL; @@ -9264,7 +7547,6 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) runtime_enabled = quota != RUNTIME_INF; runtime_was_enabled = cfs_b->quota != RUNTIME_INF; account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); - raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; @@ -9280,13 +7562,13 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) for_each_possible_cpu(i) { struct cfs_rq *cfs_rq = tg->cfs_rq[i]; - struct rq *rq = rq_of(cfs_rq); + struct rq *rq = cfs_rq->rq; raw_spin_lock_irq(&rq->lock); cfs_rq->runtime_enabled = runtime_enabled; cfs_rq->runtime_remaining = 0; - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq->throttled) unthrottle_cfs_rq(cfs_rq); raw_spin_unlock_irq(&rq->lock); } @@ -9300,7 +7582,7 @@ int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) { u64 quota, period; - period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); + period = ktime_to_ns(tg->cfs_bandwidth.period); if (cfs_quota_us < 0) quota = RUNTIME_INF; else @@ -9313,10 +7595,10 @@ long tg_get_cfs_quota(struct task_group *tg) { u64 quota_us; - if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) + if (tg->cfs_bandwidth.quota == RUNTIME_INF) return -1; - quota_us = tg_cfs_bandwidth(tg)->quota; + quota_us = tg->cfs_bandwidth.quota; do_div(quota_us, NSEC_PER_USEC); return quota_us; @@ -9327,7 +7609,7 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) u64 quota, period; period = (u64)cfs_period_us * NSEC_PER_USEC; - quota = tg_cfs_bandwidth(tg)->quota; + quota = tg->cfs_bandwidth.quota; if (period <= 0) return -EINVAL; @@ -9339,7 +7621,7 @@ long tg_get_cfs_period(struct task_group *tg) { u64 cfs_period_us; - cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); + cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); do_div(cfs_period_us, NSEC_PER_USEC); return cfs_period_us; @@ -9399,13 +7681,13 @@ static u64 normalize_cfs_quota(struct task_group *tg, static int tg_cfs_schedulable_down(struct task_group *tg, void *data) { struct cfs_schedulable_data *d = data; - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; s64 quota = 0, parent_quota = -1; if (!tg->parent) { quota = RUNTIME_INF; } else { - struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); + struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; quota = normalize_cfs_quota(tg, d); parent_quota = parent_b->hierarchal_quota; @@ -9449,7 +7731,7 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, struct cgroup_map_cb *cb) { struct task_group *tg = cgroup_tg(cgrp); - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; cb->fill(cb, "nr_periods", cfs_b->nr_periods); cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); @@ -9748,7 +8030,7 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) * * called with rq->lock held. */ -static void cpuacct_charge(struct task_struct *tsk, u64 cputime) +void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; int cpu; @@ -9790,7 +8072,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) /* * Charge the system/user time to the task's accounting group. */ -static void cpuacct_update_stats(struct task_struct *tsk, +void cpuacct_update_stats(struct task_struct *tsk, enum cpuacct_stat_index idx, cputime_t val) { struct cpuacct *ca; |