summaryrefslogtreecommitdiff
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c1145
1 files changed, 1130 insertions, 15 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 69ba882681c5..8e858ca6bcd0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1321,6 +1321,8 @@ void post_init_entity_util_avg(struct task_struct *p)
sa->runnable_avg = sa->util_avg;
}
+static inline void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec);
+
static s64 update_se(struct rq *rq, struct sched_entity *se)
{
u64 now = rq_clock_task(rq);
@@ -1343,6 +1345,7 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
trace_sched_stat_runtime(running, delta_exec);
account_group_exec_runtime(running, delta_exec);
+ account_mm_sched(rq, running, delta_exec);
/* cgroup time is always accounted against the donor */
cgroup_account_cputime(donor, delta_exec);
@@ -1364,6 +1367,581 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
static void set_next_buddy(struct sched_entity *se);
+#ifdef CONFIG_SCHED_CACHE
+
+/*
+ * XXX numbers come from a place the sun don't shine -- probably wants to be SD
+ * tunable or so.
+ */
+#define EPOCH_PERIOD (HZ / 100) /* 10 ms */
+#define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */
+__read_mostly unsigned int llc_aggr_tolerance = 1;
+__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD;
+__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT;
+__read_mostly unsigned int llc_imb_pct = 20;
+__read_mostly unsigned int llc_overaggr_pct = 50;
+
+static int llc_id(int cpu)
+{
+ if (cpu < 0)
+ return -1;
+
+ return per_cpu(sd_llc_id, cpu);
+}
+
+static inline int get_sched_cache_scale(int mul)
+{
+ unsigned int tol = READ_ONCE(llc_aggr_tolerance);
+
+ if (!tol)
+ return 0;
+
+ if (tol >= 100)
+ return INT_MAX;
+
+ return (1 + (tol - 1) * mul);
+}
+
+static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
+{
+#ifdef CONFIG_NUMA_BALANCING
+ unsigned long llc, footprint;
+ struct sched_domain *sd;
+ int scale;
+
+ guard(rcu)();
+
+ sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd);
+ if (!sd)
+ return true;
+
+ if (static_branch_likely(&sched_numa_balancing)) {
+ /*
+ * TBD: RDT exclusive LLC ways reserved should be
+ * excluded.
+ */
+ llc = sd->llc_bytes;
+ footprint = READ_ONCE(mm->sc_stat.footprint);
+
+ /*
+ * Scale the LLC size by 256*llc_aggr_tolerance
+ * and compare it to the task's footprint.
+ *
+ * Suppose the L3 size is 32MB. If the
+ * llc_aggr_tolerance is 1:
+ * When the footprint is larger than 32MB, the
+ * process is regarded as exceeding the LLC
+ * capacity. If the llc_aggr_tolerance is 99:
+ * When the footprint is larger than 784GB, the
+ * process is regarded as exceeding the LLC
+ * capacity:
+ * 784GB = (1 + (99 - 1) * 256) * 32MB
+ * If the llc_aggr_tolerance is 100:
+ * ignore the footprint and do the aggregation
+ * anyway.
+ */
+ scale = get_sched_cache_scale(256);
+ if (scale == INT_MAX)
+ return false;
+
+ return ((llc * (u64)scale) < (footprint * PAGE_SIZE));
+ }
+#endif
+ return false;
+}
+
+static bool invalid_llc_nr(struct mm_struct *mm, struct task_struct *p,
+ int cpu)
+{
+ int scale;
+
+ if (get_nr_threads(p) <= 1)
+ return true;
+
+ /*
+ * Scale the number of 'cores' in a LLC by llc_aggr_tolerance
+ * and compare it to the task's active threads.
+ */
+ scale = get_sched_cache_scale(1);
+ if (scale == INT_MAX)
+ return false;
+
+ return !fits_capacity((mm->sc_stat.nr_running_avg * cpu_smt_num_threads),
+ (scale * per_cpu(sd_llc_size, cpu)));
+}
+
+static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
+{
+ int pref_llc, pref_llc_queued;
+ struct sched_domain *sd;
+
+ pref_llc = p->preferred_llc;
+ if (pref_llc < 0)
+ return;
+
+ pref_llc_queued = (pref_llc == task_llc(p));
+ rq->nr_llc_running++;
+ rq->nr_pref_llc_running += pref_llc_queued;
+
+ /*
+ * Record whether p is enqueued on its preferred
+ * LLC, in order to pair with account_llc_dequeue()
+ * to maintain a consistent nr_pref_llc_running per
+ * runqueue.
+ * This is necessary because a race condition exists:
+ * after a task is enqueued on a runqueue, task_llc(p)
+ * may change due to CPU hotplug. Therefore, checking
+ * task_llc(p) to determine whether the task is being
+ * dequeued from its preferred LLC is unreliable and
+ * can cause inconsistent values - checking the
+ * p->pref_llc_queued in account_llc_dequeue() would
+ * be reliable.
+ */
+ p->pref_llc_queued = pref_llc_queued;
+
+ sd = rcu_dereference_all(rq->sd);
+ if (sd && (unsigned int)pref_llc < sd->llc_max)
+ sd->llc_counts[pref_llc]++;
+}
+
+static void account_llc_dequeue(struct rq *rq, struct task_struct *p)
+{
+ struct sched_domain *sd;
+ int pref_llc;
+
+ pref_llc = p->preferred_llc;
+ if (pref_llc < 0)
+ return;
+
+ rq->nr_llc_running--;
+ if (p->pref_llc_queued) {
+ rq->nr_pref_llc_running--;
+ /*
+ * Update the status in case
+ * other logic might query
+ * this.
+ */
+ p->pref_llc_queued = 0;
+ }
+
+ sd = rcu_dereference_all(rq->sd);
+ if (sd && (unsigned int)pref_llc < sd->llc_max) {
+ /*
+ * There is a race condition between dequeue
+ * and CPU hotplug. After a task has been enqueued
+ * on CPUx, a CPU hotplug event occurs, and all online
+ * CPUs (including CPUx) rebuild their sched_domains
+ * and reset statistics to zero(including sd->llc_counts).
+ * This can cause temporary undercount and we have to
+ * check for such underflow in sd->llc_counts.
+ *
+ * This undercount is temporary and accurate accounting
+ * will resume once the rq has a chance to be idle.
+ */
+ if (sd->llc_counts[pref_llc])
+ sd->llc_counts[pref_llc]--;
+ }
+}
+
+void mm_init_sched(struct mm_struct *mm,
+ struct sched_cache_time __percpu *_pcpu_sched)
+{
+ unsigned long epoch = 0;
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct sched_cache_time *pcpu_sched = per_cpu_ptr(_pcpu_sched, i);
+ struct rq *rq = cpu_rq(i);
+
+ pcpu_sched->runtime = 0;
+ /* a slightly stale cpu epoch is acceptible */
+ pcpu_sched->epoch = rq->cpu_epoch;
+ epoch = rq->cpu_epoch;
+ }
+
+ raw_spin_lock_init(&mm->sc_stat.lock);
+ mm->sc_stat.epoch = epoch;
+ mm->sc_stat.cpu = -1;
+ mm->sc_stat.next_scan = jiffies;
+ mm->sc_stat.nr_running_avg = 0;
+ mm->sc_stat.footprint = 0;
+ /*
+ * The update to mm->sc_stat should not be reordered
+ * before initialization to mm's other fields, in case
+ * the readers may get invalid mm_sched_epoch, etc.
+ */
+ smp_store_release(&mm->sc_stat.pcpu_sched, _pcpu_sched);
+}
+
+/* because why would C be fully specified */
+static __always_inline void __shr_u64(u64 *val, unsigned int n)
+{
+ if (n >= 64) {
+ *val = 0;
+ return;
+ }
+ *val >>= n;
+}
+
+static inline void __update_mm_sched(struct rq *rq,
+ struct sched_cache_time *pcpu_sched)
+{
+ lockdep_assert_held(&rq->cpu_epoch_lock);
+
+ unsigned int period = max(READ_ONCE(llc_epoch_period), 1U);
+ unsigned long n, now = jiffies;
+ long delta = now - rq->cpu_epoch_next;
+
+ if (delta > 0) {
+ n = (delta + period - 1) / period;
+ rq->cpu_epoch += n;
+ rq->cpu_epoch_next += n * period;
+ __shr_u64(&rq->cpu_runtime, n);
+ }
+
+ n = rq->cpu_epoch - pcpu_sched->epoch;
+ if (n) {
+ pcpu_sched->epoch += n;
+ __shr_u64(&pcpu_sched->runtime, n);
+ }
+}
+
+static unsigned long fraction_mm_sched(struct rq *rq,
+ struct sched_cache_time *pcpu_sched)
+{
+ guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock);
+
+ __update_mm_sched(rq, pcpu_sched);
+
+ /*
+ * Runtime is a geometric series (r=0.5) and as such will sum to twice
+ * the accumulation period, this means the multiplcation here should
+ * not overflow.
+ */
+ return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1);
+}
+
+static int get_pref_llc(struct task_struct *p, struct mm_struct *mm)
+{
+ int mm_sched_llc = -1, mm_sched_cpu;
+
+ if (!mm)
+ return -1;
+
+ mm_sched_cpu = READ_ONCE(mm->sc_stat.cpu);
+ if (mm_sched_cpu != -1) {
+ mm_sched_llc = llc_id(mm_sched_cpu);
+
+#ifdef CONFIG_NUMA_BALANCING
+ /*
+ * Don't assign preferred LLC if it
+ * conflicts with NUMA balancing.
+ * This can happen when sched_setnuma() gets
+ * called, however it is not much of an issue
+ * because we expect account_mm_sched() to get
+ * called fairly regularly -- at a higher rate
+ * than sched_setnuma() at least -- and thus the
+ * conflict only exists for a short period of time.
+ */
+ if (static_branch_likely(&sched_numa_balancing) &&
+ p->numa_preferred_nid >= 0 &&
+ cpu_to_node(mm_sched_cpu) != p->numa_preferred_nid)
+ mm_sched_llc = -1;
+#endif
+ }
+
+ return mm_sched_llc;
+}
+
+static unsigned int task_running_on_cpu(int cpu, struct task_struct *p);
+
+static inline
+void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
+{
+ struct sched_cache_time *pcpu_sched;
+ struct mm_struct *mm = p->mm;
+ int mm_sched_llc = -1;
+ unsigned long epoch;
+
+ if (!sched_cache_enabled())
+ return;
+
+ if (p->sched_class != &fair_sched_class)
+ return;
+ /*
+ * init_task, kthreads and user thread created
+ * by user_mode_thread() don't have mm.
+ */
+ if (!mm || !mm->sc_stat.pcpu_sched)
+ return;
+
+ pcpu_sched = per_cpu_ptr(mm->sc_stat.pcpu_sched, cpu_of(rq));
+
+ scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
+ __update_mm_sched(rq, pcpu_sched);
+ pcpu_sched->runtime += delta_exec;
+ rq->cpu_runtime += delta_exec;
+ epoch = rq->cpu_epoch;
+ }
+
+ /*
+ * If this process hasn't hit task_cache_work() for a while invalidate
+ * its preferred state.
+ */
+ if ((long)(epoch - READ_ONCE(mm->sc_stat.epoch)) > llc_epoch_affinity_timeout ||
+ invalid_llc_nr(mm, p, cpu_of(rq)) ||
+ exceed_llc_capacity(mm, cpu_of(rq))) {
+ if (READ_ONCE(mm->sc_stat.cpu) != -1)
+ WRITE_ONCE(mm->sc_stat.cpu, -1);
+ }
+
+ mm_sched_llc = get_pref_llc(p, mm);
+
+ /* task not on rq accounted later in account_entity_enqueue() */
+ if (task_running_on_cpu(rq->cpu, p) &&
+ READ_ONCE(p->preferred_llc) != mm_sched_llc) {
+ account_llc_dequeue(rq, p);
+ WRITE_ONCE(p->preferred_llc, mm_sched_llc);
+ account_llc_enqueue(rq, p);
+ }
+}
+
+static void task_tick_cache(struct rq *rq, struct task_struct *p)
+{
+ struct callback_head *work = &p->cache_work;
+ struct mm_struct *mm = p->mm;
+ unsigned long epoch;
+
+ if (!sched_cache_enabled())
+ return;
+
+ if (!mm || p->flags & PF_KTHREAD ||
+ !mm->sc_stat.pcpu_sched)
+ return;
+
+ epoch = rq->cpu_epoch;
+ /* avoid moving backwards */
+ if (time_after_eq(mm->sc_stat.epoch, epoch))
+ return;
+
+ guard(raw_spinlock)(&mm->sc_stat.lock);
+
+ if (work->next == work) {
+ task_work_add(p, work, TWA_RESUME);
+ WRITE_ONCE(mm->sc_stat.epoch, epoch);
+ }
+}
+
+static void get_scan_cpumasks(cpumask_var_t cpus, struct task_struct *p)
+{
+#ifdef CONFIG_NUMA_BALANCING
+ int cpu, curr_cpu, nid, pref_nid;
+
+ if (!static_branch_likely(&sched_numa_balancing))
+ goto out;
+
+ cpu = READ_ONCE(p->mm->sc_stat.cpu);
+ if (cpu != -1)
+ nid = cpu_to_node(cpu);
+ curr_cpu = task_cpu(p);
+
+ /*
+ * Scanning in the preferred NUMA node is ideal. However, the NUMA
+ * preferred node is per-task rather than per-process. It is possible
+ * for different threads of the process to have distinct preferred
+ * nodes; consequently, the process-wide preferred LLC may bounce
+ * between different nodes. As a workaround, maintain the scan
+ * CPU mask to also cover the process's current preferred LLC and the
+ * current running node to mitigate the bouncing risk.
+ * TBD: numa_group should be considered during task aggregation.
+ */
+ pref_nid = p->numa_preferred_nid;
+ /* honor the task's preferred node */
+ if (pref_nid == NUMA_NO_NODE)
+ goto out;
+
+ cpumask_or(cpus, cpus, cpumask_of_node(pref_nid));
+
+ /* honor the task's preferred LLC CPU */
+ if (cpu != -1 && !cpumask_test_cpu(cpu, cpus) && nid != NUMA_NO_NODE)
+ cpumask_or(cpus, cpus, cpumask_of_node(nid));
+
+ /* make sure the task's current running node is included */
+ if (!cpumask_test_cpu(curr_cpu, cpus))
+ cpumask_or(cpus, cpus, cpumask_of_node(cpu_to_node(curr_cpu)));
+
+ return;
+
+out:
+#endif
+ cpumask_copy(cpus, cpu_online_mask);
+}
+
+static inline void update_avg_scale(u64 *avg, u64 sample)
+{
+ int factor = per_cpu(sd_llc_size, raw_smp_processor_id());
+ s64 diff = sample - *avg;
+ u32 divisor;
+
+ /*
+ * Scale the divisor based on the number of CPUs contained
+ * in the LLC. This scaling ensures smaller LLC domains use
+ * a smaller divisor to achieve more precise sensitivity to
+ * changes in nr_running, while larger LLC domains are capped
+ * at a maximum divisor of 8 which is the default smoothing
+ * factor of EWMA in update_avg().
+ */
+ divisor = clamp_t(u32, (factor >> 2), 2, 8);
+ *avg += div64_s64(diff, divisor);
+}
+
+static void task_cache_work(struct callback_head *work)
+{
+ int cpu, m_a_cpu = -1, nr_running = 0, curr_cpu;
+ unsigned long next_scan, now = jiffies;
+ struct task_struct *p = current, *cur;
+ unsigned long curr_m_a_occ = 0;
+ struct mm_struct *mm = p->mm;
+ unsigned long m_a_occ = 0;
+ cpumask_var_t cpus;
+
+ WARN_ON_ONCE(work != &p->cache_work);
+
+ work->next = work;
+
+ if (p->flags & PF_EXITING)
+ return;
+
+ next_scan = READ_ONCE(mm->sc_stat.next_scan);
+ if (time_before(now, next_scan))
+ return;
+
+ /* only 1 thread is allowed to scan */
+ if (!try_cmpxchg(&mm->sc_stat.next_scan, &next_scan,
+ now + max_t(unsigned long,
+ READ_ONCE(llc_epoch_period), 1)))
+ return;
+
+ curr_cpu = task_cpu(p);
+ if (invalid_llc_nr(mm, p, curr_cpu) ||
+ exceed_llc_capacity(mm, curr_cpu)) {
+ if (READ_ONCE(mm->sc_stat.cpu) != -1)
+ WRITE_ONCE(mm->sc_stat.cpu, -1);
+
+ return;
+ }
+
+ if (!zalloc_cpumask_var(&cpus, GFP_KERNEL))
+ return;
+
+ scoped_guard (cpus_read_lock) {
+ guard(rcu)();
+
+ get_scan_cpumasks(cpus, p);
+
+ for_each_cpu(cpu, cpus) {
+ /* XXX sched_cluster_active */
+ struct sched_domain *sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
+ unsigned long occ, m_occ = 0, a_occ = 0;
+ int m_cpu = -1, i;
+
+ if (!sd)
+ continue;
+
+ for_each_cpu(i, sched_domain_span(sd)) {
+ occ = fraction_mm_sched(cpu_rq(i),
+ per_cpu_ptr(mm->sc_stat.pcpu_sched, i));
+ a_occ += occ;
+ if (occ > m_occ) {
+ m_occ = occ;
+ m_cpu = i;
+ }
+
+ cur = rcu_dereference_all(cpu_rq(i)->curr);
+ if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) &&
+ cur->mm == mm)
+ nr_running++;
+ }
+
+ /*
+ * Compare the accumulated occupancy of each LLC. The
+ * reason for using accumulated occupancy rather than average
+ * per CPU occupancy is that it works better in asymmetric LLC
+ * scenarios.
+ * For example, if there are 2 threads in a 4CPU LLC and 3
+ * threads in an 8CPU LLC, it might be better to choose the one
+ * with 3 threads. However, this would not be the case if the
+ * occupancy is divided by the number of CPUs in an LLC (i.e.,
+ * if average per CPU occupancy is used).
+ * Besides, NUMA balancing fault statistics behave similarly:
+ * the total number of faults per node is compared rather than
+ * the average number of faults per CPU. This strategy is also
+ * followed here.
+ */
+ if (a_occ > m_a_occ) {
+ m_a_occ = a_occ;
+ m_a_cpu = m_cpu;
+ }
+
+ if (llc_id(cpu) == llc_id(READ_ONCE(mm->sc_stat.cpu)))
+ curr_m_a_occ = a_occ;
+
+ cpumask_andnot(cpus, cpus, sched_domain_span(sd));
+ }
+ }
+
+ if (m_a_occ > (2 * curr_m_a_occ)) {
+ /*
+ * Avoid switching sc_stat.cpu too fast.
+ * The reason to choose 2X is because:
+ * 1. It is better to keep the preferred LLC stable,
+ * rather than changing it frequently and cause migrations
+ * 2. 2X means the new preferred LLC has at least 1 more
+ * busy CPU than the old one(200% vs 100%, eg)
+ * 3. 2X is chosen based on test results, as it delivers
+ * the optimal performance gain so far.
+ */
+ WRITE_ONCE(mm->sc_stat.cpu, m_a_cpu);
+ }
+
+ update_avg_scale(&mm->sc_stat.nr_running_avg, nr_running);
+ free_cpumask_var(cpus);
+}
+
+void init_sched_mm(struct task_struct *p)
+{
+ struct callback_head *work = &p->cache_work;
+
+ init_task_work(work, task_cache_work);
+ work->next = work;
+ /*
+ * Reset new task's preference to avoid
+ * polluting account_llc_enqueue().
+ */
+ p->preferred_llc = -1;
+}
+
+#else /* CONFIG_SCHED_CACHE */
+
+static inline void account_mm_sched(struct rq *rq, struct task_struct *p,
+ s64 delta_exec) { }
+
+void init_sched_mm(struct task_struct *p) { }
+
+static void task_tick_cache(struct rq *rq, struct task_struct *p) { }
+
+static inline int get_pref_llc(struct task_struct *p,
+ struct mm_struct *mm)
+{
+ return -1;
+}
+
+static void account_llc_enqueue(struct rq *rq, struct task_struct *p) {}
+
+static void account_llc_dequeue(struct rq *rq, struct task_struct *p) {}
+
+#endif /* CONFIG_SCHED_CACHE */
+
/*
* Used by other classes to account runtime.
*/
@@ -3038,6 +3616,7 @@ static void task_numa_placement(struct task_struct *p)
unsigned long total_faults;
u64 runtime, period;
spinlock_t *group_lock = NULL;
+ long __maybe_unused new_fp;
struct numa_group *ng;
/*
@@ -3112,6 +3691,31 @@ static void task_numa_placement(struct task_struct *p)
ng->total_faults += diff;
group_faults += ng->faults[mem_idx];
}
+#ifdef CONFIG_SCHED_CACHE
+ /*
+ * Per task p->numa_faults[mem_idx] converges,
+ * so the accumulation of each task's faults
+ * converges too - Given the number of threads,
+ * it cannot overflow an unsigned long.
+ * Racy with concurrent updates from other threads
+ * sharing this mm. Acceptable since footprint is a
+ * heuristic and occasional lost updates are tolerable.
+ *
+ * If a task exits, its corresponding footprint must
+ * be subtracted from the mm->sc_stat.footprint, otherwise
+ * the mm->sc_stat.footprint will not converge:
+ * the exiting thread's footprint remains unchanged/undecayed
+ * in mm->sc_stat.footprint. See exit_mm().
+ *
+ * Lost updates and unsynchronized subtraction
+ * in exit_mm() can cause footprint + diff to
+ * go negative. Clamp to zero to prevent the
+ * unsigned footprint from wrapping.
+ */
+ new_fp = (long)READ_ONCE(p->mm->sc_stat.footprint) + diff;
+ WRITE_ONCE(p->mm->sc_stat.footprint,
+ max(new_fp, 0L));
+#endif
}
if (!ng) {
@@ -3836,9 +4440,11 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
if (entity_is_task(se)) {
+ struct task_struct *p = task_of(se);
struct rq *rq = rq_of(cfs_rq);
- account_numa_enqueue(rq, task_of(se));
+ account_numa_enqueue(rq, p);
+ account_llc_enqueue(rq, p);
list_add(&se->group_node, &rq->cfs_tasks);
}
cfs_rq->nr_queued++;
@@ -3849,7 +4455,11 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
if (entity_is_task(se)) {
- account_numa_dequeue(rq_of(cfs_rq), task_of(se));
+ struct task_struct *p = task_of(se);
+ struct rq *rq = rq_of(cfs_rq);
+
+ account_numa_dequeue(rq, p);
+ account_llc_dequeue(rq, p);
list_del_init(&se->group_node);
}
cfs_rq->nr_queued--;
@@ -9617,6 +10227,16 @@ enum group_type {
*/
group_imbalanced,
/*
+ * There are tasks running on non-preferred LLC, possible to move
+ * them to their preferred LLC without creating too much imbalance.
+ * The priority of group_llc_balance is lower than that of
+ * group_overloaded and higher than that of all other group types.
+ * This is because group_llc_balance may exacerbate load imbalance.
+ * If the LLC balancing attempt fails, the nr_balance_failed
+ * mechanism will trigger other group types to rebalance the load.
+ */
+ group_llc_balance,
+ /*
* The CPU is overloaded and can't provide expected CPU cycles to all
* tasks.
*/
@@ -9627,7 +10247,8 @@ enum migration_type {
migrate_load = 0,
migrate_util,
migrate_task,
- migrate_misfit
+ migrate_misfit,
+ migrate_llc_task
};
#define LBF_ALL_PINNED 0x01
@@ -9635,6 +10256,7 @@ enum migration_type {
#define LBF_DST_PINNED 0x04
#define LBF_SOME_PINNED 0x08
#define LBF_ACTIVE_LB 0x10
+#define LBF_LLC_PINNED 0x20
struct lb_env {
struct sched_domain *sd;
@@ -9792,6 +10414,298 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_
return 0;
}
+#ifdef CONFIG_SCHED_CACHE
+/*
+ * The margin used when comparing LLC utilization with CPU capacity.
+ * It determines the LLC load level where active LLC aggregation is
+ * done.
+ * Derived from fits_capacity().
+ *
+ * (default: ~50%, tunable via debugfs)
+ */
+static bool fits_llc_capacity(unsigned long util, unsigned long max)
+{
+ u32 aggr_pct = llc_overaggr_pct;
+
+ /*
+ * For single core systems, raise the aggregation
+ * threshold to accommodate more tasks.
+ */
+ if (cpu_smt_num_threads == 1)
+ aggr_pct = (aggr_pct * 3 / 2);
+
+ return util * 100 < max * aggr_pct;
+}
+
+/*
+ * The margin used when comparing utilization.
+ * is 'util1' noticeably greater than 'util2'
+ * Derived from capacity_greater().
+ * Bias is in perentage.
+ */
+/* Allows dst util to be bigger than src util by up to bias percent */
+#define util_greater(util1, util2) \
+ ((util1) * 100 > (util2) * (100 + llc_imb_pct))
+
+static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util,
+ unsigned long *cap)
+{
+ struct sched_domain_shared *sd_share;
+
+ sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
+ if (!sd_share)
+ return false;
+
+ *util = READ_ONCE(sd_share->util_avg);
+ *cap = READ_ONCE(sd_share->capacity);
+
+ return true;
+}
+
+/*
+ * Decision matrix according to the LLC utilization. To
+ * decide whether we can do task aggregation across LLC.
+ *
+ * By default, 50% is the threshold for treating the LLC
+ * as busy. The reason for choosing 50% is to avoid saturation
+ * of SMT-2, and it is also a safe cutoff for other SMT-n
+ * platforms. SMT-1 has higher threshold because it is
+ * supposed to accommodate more tasks, see fits_llc_capacity().
+ *
+ * 20% is the utilization imbalance percentage to decide
+ * if the preferred LLC is busier than the non-preferred LLC.
+ * 20 is a little higher than the LLC domain's imbalance_pct
+ * 17. The hysteresis is used to avoid task bouncing between the
+ * preferred LLC and the non-preferred LLC, and it will
+ * be turned into tunable debugfs.
+ *
+ * 1. moving towards the preferred LLC, dst is the preferred
+ * LLC, src is not.
+ *
+ * src \ dst 30% 40% 50% 60%
+ * 30% Y Y Y N
+ * 40% Y Y Y Y
+ * 50% Y Y G G
+ * 60% Y Y G G
+ *
+ * 2. moving out of the preferred LLC, src is the preferred
+ * LLC, dst is not:
+ *
+ * src \ dst 30% 40% 50% 60%
+ * 30% N N N N
+ * 40% N N N N
+ * 50% N N G G
+ * 60% Y N G G
+ *
+ * src : src_util
+ * dst : dst_util
+ * Y : Yes, migrate
+ * N : No, do not migrate
+ * G : let the Generic load balance to even the load.
+ *
+ * The intention is that if both LLCs are quite busy, cache aware
+ * load balance should not be performed, and generic load balance
+ * should take effect. However, if one is busy and the other is not,
+ * the preferred LLC capacity(50%) and imbalance criteria(20%) should
+ * be considered to determine whether LLC aggregation should be
+ * performed to bias the load towards the preferred LLC.
+ */
+
+/* migration decision, 3 states are orthogonal. */
+enum llc_mig {
+ mig_forbid = 0, /* N: Don't migrate task, respect LLC preference */
+ mig_llc, /* Y: Do LLC preference based migration */
+ mig_unrestricted /* G: Don't restrict generic load balance migration */
+};
+
+/*
+ * Check if task can be moved from the source LLC to the
+ * destination LLC without breaking cache aware preferrence.
+ * src_cpu and dst_cpu are arbitrary CPUs within the source
+ * and destination LLCs, respectively.
+ */
+static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu,
+ unsigned long tsk_util,
+ bool to_pref)
+{
+ unsigned long src_util, dst_util, src_cap, dst_cap;
+
+ if (!get_llc_stats(src_cpu, &src_util, &src_cap) ||
+ !get_llc_stats(dst_cpu, &dst_util, &dst_cap))
+ return mig_unrestricted;
+
+ src_util = src_util < tsk_util ? 0 : src_util - tsk_util;
+ dst_util = dst_util + tsk_util;
+
+ if (!fits_llc_capacity(dst_util, dst_cap) &&
+ !fits_llc_capacity(src_util, src_cap))
+ return mig_unrestricted;
+
+ if (to_pref) {
+ /*
+ * Don't migrate if we will get preferred LLC too
+ * heavily loaded and if the dest is much busier
+ * than the src, in which case migration will
+ * increase the imbalance too much.
+ */
+ if (!fits_llc_capacity(dst_util, dst_cap) &&
+ util_greater(dst_util, src_util))
+ return mig_forbid;
+ } else {
+ /*
+ * Don't migrate if we will leave preferred LLC
+ * too idle, or if this migration leads to the
+ * non-preferred LLC falls within sysctl_aggr_imb percent
+ * of preferred LLC, leading to migration again
+ * back to preferred LLC.
+ */
+ if (fits_llc_capacity(src_util, src_cap) ||
+ !util_greater(src_util, dst_util))
+ return mig_forbid;
+ }
+ return mig_llc;
+}
+
+/*
+ * Check if task p can migrate from source LLC to
+ * destination LLC in terms of cache aware load balance.
+ */
+static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu,
+ struct task_struct *p)
+{
+ struct mm_struct *mm;
+ bool to_pref;
+ int cpu;
+
+ mm = p->mm;
+ if (!mm)
+ return mig_unrestricted;
+
+ cpu = READ_ONCE(mm->sc_stat.cpu);
+ if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu))
+ return mig_unrestricted;
+
+ /* skip cache aware load balance for too many threads */
+ if (invalid_llc_nr(mm, p, dst_cpu) ||
+ exceed_llc_capacity(mm, dst_cpu)) {
+ if (READ_ONCE(mm->sc_stat.cpu) != -1)
+ WRITE_ONCE(mm->sc_stat.cpu, -1);
+ return mig_unrestricted;
+ }
+
+ if (cpus_share_cache(dst_cpu, cpu))
+ to_pref = true;
+ else if (cpus_share_cache(src_cpu, cpu))
+ to_pref = false;
+ else
+ return mig_unrestricted;
+
+ return can_migrate_llc(src_cpu, dst_cpu,
+ task_util(p), to_pref);
+}
+
+/*
+ * Check if active load balance breaks LLC locality in
+ * terms of cache aware load balance. The load level and
+ * imbalance do not warrant breaking LLC preference per
+ * the can_migrate_llc() policy. Here, the benefit of
+ * LLC locality outweighs the power efficiency gained from
+ * migrating the only runnable task away.
+ */
+static inline bool
+alb_break_llc(struct lb_env *env)
+{
+ if (!sched_cache_enabled())
+ return false;
+
+ if (cpus_share_cache(env->src_cpu, env->dst_cpu))
+ return false;
+ /*
+ * All tasks prefer to stay on their current CPU.
+ * Do not pull a task from its preferred CPU if:
+ * 1. It is the only task running and does not exceed
+ * imbalance allowance; OR
+ * 2. Migrating it away from its preferred LLC would violate
+ * the cache-aware scheduling policy.
+ */
+ if (env->src_rq->nr_pref_llc_running &&
+ env->src_rq->nr_pref_llc_running == env->src_rq->cfs.h_nr_runnable) {
+ unsigned long util = 0;
+ struct task_struct *cur;
+
+ if (env->src_rq->nr_running <= 1)
+ return true;
+
+ cur = rcu_dereference_all(env->src_rq->curr);
+ if (cur && cur->sched_class == &fair_sched_class)
+ util = task_util(cur);
+
+ if (can_migrate_llc(env->src_cpu, env->dst_cpu,
+ util, false) == mig_forbid)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Check if migrating task p from env->src_cpu to
+ * env->dst_cpu breaks LLC localiy.
+ */
+static bool migrate_degrades_llc(struct task_struct *p, struct lb_env *env)
+{
+ if (!sched_cache_enabled())
+ return false;
+
+ if (task_has_sched_core(p))
+ return false;
+ /*
+ * Skip over tasks that would degrade LLC locality;
+ * only when nr_balanced_failed is sufficiently high do we
+ * ignore this constraint.
+ *
+ * Threshold of cache_nice_tries is set to 1 higher
+ * than nr_balance_failed to avoid excessive task
+ * migration at the same time.
+ */
+ if (env->sd->nr_balance_failed >= env->sd->cache_nice_tries + 1)
+ return false;
+
+ /*
+ * We know the env->src_cpu has some tasks prefer to
+ * run on env->dst_cpu, skip the tasks do not prefer
+ * env->dst_cpu, and find the one that prefers.
+ */
+ if (env->migration_type == migrate_llc_task &&
+ READ_ONCE(p->preferred_llc) != llc_id(env->dst_cpu))
+ return true;
+
+ if (can_migrate_llc_task(env->src_cpu,
+ env->dst_cpu, p) != mig_forbid)
+ return false;
+
+ return true;
+}
+
+#else
+static inline bool get_llc_stats(int cpu, unsigned long *util,
+ unsigned long *cap)
+{
+ return false;
+}
+
+static inline bool
+alb_break_llc(struct lb_env *env)
+{
+ return false;
+}
+
+static inline bool
+migrate_degrades_llc(struct task_struct *p, struct lb_env *env)
+{
+ return false;
+}
+#endif
/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*/
@@ -9888,10 +10802,29 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
return 1;
degrades = migrate_degrades_locality(p, env);
- if (!degrades)
+ if (!degrades) {
+ /*
+ * If the NUMA locality is not broken,
+ * further check if migration would hurt
+ * LLC locality.
+ */
+ if (migrate_degrades_llc(p, env)) {
+ /*
+ * If regular load balancing fails to pull a task
+ * due to LLC locality, this is expected behavior
+ * and we set LBF_LLC_PINNED so we don't increase
+ * nr_balance_failed unecessarily.
+ */
+ if (env->migration_type != migrate_llc_task)
+ env->flags |= LBF_LLC_PINNED;
+
+ return 0;
+ }
+
hot = task_hot(p, env);
- else
+ } else {
hot = degrades > 0;
+ }
if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
if (hot)
@@ -10053,6 +10986,10 @@ static int detach_tasks(struct lb_env *env)
env->imbalance = 0;
break;
+
+ case migrate_llc_task:
+ env->imbalance--;
+ break;
}
detach_task(p, env);
@@ -10331,12 +11268,16 @@ struct sg_lb_stats {
enum group_type group_type;
unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
unsigned int group_smt_balance; /* Task on busy SMT be moved */
+ unsigned int group_llc_balance; /* Tasks should be moved to preferred LLC */
unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
unsigned int group_overutilized; /* At least one CPU is overutilized in the group */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
#endif
+#ifdef CONFIG_SCHED_CACHE
+ unsigned int nr_pref_dst_llc;
+#endif
};
/*
@@ -10594,6 +11535,9 @@ group_type group_classify(unsigned int imbalance_pct,
if (group_is_overloaded(imbalance_pct, sgs))
return group_overloaded;
+ if (sgs->group_llc_balance)
+ return group_llc_balance;
+
if (sg_imbalanced(group))
return group_imbalanced;
@@ -10748,6 +11692,105 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
return check_cpu_capacity(rq, sd);
}
+#ifdef CONFIG_SCHED_CACHE
+/*
+ * Record the statistics for this scheduler group for later
+ * use. These values guide load balancing on aggregating tasks
+ * to a LLC.
+ */
+static void record_sg_llc_stats(struct lb_env *env,
+ struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+ struct sched_domain_shared *sd_share;
+ int cpu;
+
+ if (!sched_cache_enabled() || env->idle == CPU_NEWLY_IDLE)
+ return;
+
+ /* Only care about sched domain spanning multiple LLCs */
+ if (env->sd->child != rcu_dereference_all(per_cpu(sd_llc, env->dst_cpu)))
+ return;
+
+ /*
+ * At this point we know this group spans a LLC domain.
+ * Record the statistic of this group in its corresponding
+ * shared LLC domain.
+ * Note: sd_share cannot be obtained via sd->child->shared,
+ * because the latter refers to the domain that covers the
+ * local group. Instead, sd_share should be located using
+ * the first CPU of the LLC group.
+ */
+ cpu = cpumask_first(sched_group_span(group));
+ sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
+ if (!sd_share)
+ return;
+
+ if (READ_ONCE(sd_share->util_avg) != sgs->group_util)
+ WRITE_ONCE(sd_share->util_avg, sgs->group_util);
+
+ if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity))
+ WRITE_ONCE(sd_share->capacity, sgs->group_capacity);
+}
+
+/*
+ * Do LLC balance on sched group that contains LLC, and have tasks preferring
+ * to run on LLC in idle dst_cpu.
+ */
+static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+ if (!sched_cache_enabled())
+ return false;
+
+ if (env->sd->flags & SD_SHARE_LLC)
+ return false;
+
+ /*
+ * Skip cache aware tagging if nr_balanced_failed is sufficiently high.
+ * Threshold of cache_nice_tries is set to 1 higher than nr_balance_failed
+ * to avoid excessive task migration at the same time.
+ */
+ if (env->sd->nr_balance_failed >= env->sd->cache_nice_tries + 1)
+ return false;
+
+ if (sgs->nr_pref_dst_llc &&
+ can_migrate_llc(cpumask_first(sched_group_span(group)),
+ env->dst_cpu, 0, true) == mig_llc)
+ return true;
+
+ return false;
+}
+
+static bool update_llc_busiest(struct lb_env *env,
+ struct sg_lb_stats *busiest,
+ struct sg_lb_stats *sgs)
+{
+ /*
+ * There are more tasks that want to run on dst_cpu's LLC.
+ */
+ return sgs->nr_pref_dst_llc > busiest->nr_pref_dst_llc;
+}
+#else
+static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+}
+
+static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+ return false;
+}
+
+static bool update_llc_busiest(struct lb_env *env,
+ struct sg_lb_stats *busiest,
+ struct sg_lb_stats *sgs)
+{
+ return false;
+}
+#endif
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -10784,6 +11827,20 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (cpu_overutilized(i))
sgs->group_overutilized = 1;
+#ifdef CONFIG_SCHED_CACHE
+ if (sched_cache_enabled()) {
+ struct sched_domain *sd_tmp;
+ int dst_llc;
+
+ dst_llc = llc_id(env->dst_cpu);
+ if (llc_id(i) != dst_llc) {
+ sd_tmp = rcu_dereference_all(rq->sd);
+ if (sd_tmp && (unsigned int)dst_llc < sd_tmp->llc_max)
+ sgs->nr_pref_dst_llc += sd_tmp->llc_counts[dst_llc];
+ }
+ }
+#endif
+
/*
* No need to call idle_cpu() if nr_running is not 0
*/
@@ -10824,17 +11881,24 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_weight = group->group_weight;
- /* Check if dst CPU is idle and preferred to this group */
- if (!local_group && env->idle && sgs->sum_h_nr_running &&
- sched_group_asym(env, sgs, group))
- sgs->group_asym_packing = 1;
+ if (!local_group) {
+ /* Check if dst CPU is idle and preferred to this group */
+ if (env->idle && sgs->sum_h_nr_running &&
+ sched_group_asym(env, sgs, group))
+ sgs->group_asym_packing = 1;
+
+ /* Check for loaded SMT group to be balanced to dst CPU */
+ if (smt_balance(env, sgs, group))
+ sgs->group_smt_balance = 1;
- /* Check for loaded SMT group to be balanced to dst CPU */
- if (!local_group && smt_balance(env, sgs, group))
- sgs->group_smt_balance = 1;
+ /* Check for tasks in this group can be moved to their preferred LLC */
+ if (llc_balance(env, sgs, group))
+ sgs->group_llc_balance = 1;
+ }
sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
+ record_sg_llc_stats(env, sgs, group);
/* Computing avg_load makes sense only when group is overloaded */
if (sgs->group_type == group_overloaded)
sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
@@ -10899,6 +11963,10 @@ static bool update_sd_pick_busiest(struct lb_env *env,
/* Select the overloaded group with highest avg_load. */
return sgs->avg_load > busiest->avg_load;
+ case group_llc_balance:
+ /* Select the group with most tasks preferring dst LLC */
+ return update_llc_busiest(env, busiest, sgs);
+
case group_imbalanced:
/*
* Select the 1st imbalanced group as we don't have any way to
@@ -11161,6 +12229,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
return false;
break;
+ case group_llc_balance:
case group_imbalanced:
case group_asym_packing:
case group_smt_balance:
@@ -11293,6 +12362,7 @@ sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int
return NULL;
break;
+ case group_llc_balance:
case group_imbalanced:
case group_asym_packing:
case group_smt_balance:
@@ -11547,6 +12617,15 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
return;
}
+#ifdef CONFIG_SCHED_CACHE
+ if (busiest->group_type == group_llc_balance) {
+ /* Move a task that prefer local LLC */
+ env->migration_type = migrate_llc_task;
+ env->imbalance = 1;
+ return;
+ }
+#endif
+
if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
@@ -11793,7 +12872,8 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
* group's child domain.
*/
if (sds.prefer_sibling && local->group_type == group_has_spare &&
- sibling_imbalance(env, &sds, busiest, local) > 1)
+ (busiest->group_type == group_llc_balance ||
+ sibling_imbalance(env, &sds, busiest, local) > 1))
goto force_balance;
if (busiest->group_type != group_overloaded) {
@@ -11852,7 +12932,10 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env,
{
struct rq *busiest = NULL, *rq;
unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
+ unsigned int __maybe_unused busiest_pref_llc = 0;
+ struct sched_domain __maybe_unused *sd_tmp;
unsigned int busiest_nr = 0;
+ int __maybe_unused dst_llc;
int i;
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
@@ -11980,6 +13063,23 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env,
break;
+ case migrate_llc_task:
+#ifdef CONFIG_SCHED_CACHE
+ sd_tmp = rcu_dereference_all(rq->sd);
+ dst_llc = llc_id(env->dst_cpu);
+
+ if (sd_tmp && (unsigned)dst_llc < sd_tmp->llc_max) {
+ unsigned int this_pref_llc =
+ sd_tmp->llc_counts[dst_llc];
+
+ if (busiest_pref_llc < this_pref_llc) {
+ busiest_pref_llc = this_pref_llc;
+ busiest = rq;
+ }
+ }
+#endif
+ break;
+
}
}
@@ -12031,6 +13131,9 @@ static int need_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
+ if (alb_break_llc(env))
+ return 0;
+
if (asym_active_balance(env))
return 1;
@@ -12050,7 +13153,8 @@ static int need_active_balance(struct lb_env *env)
return 1;
}
- if (env->migration_type == migrate_misfit)
+ if (env->migration_type == migrate_misfit ||
+ env->migration_type == migrate_llc_task)
return 1;
return 0;
@@ -12143,6 +13247,8 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd
case migrate_misfit:
__schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance);
break;
+ case migrate_llc_task:
+ break;
}
}
@@ -12346,9 +13452,16 @@ more_balance:
*
* Similarly for migration_misfit which is not related to
* load/util migration, don't pollute nr_balance_failed.
+ *
+ * The same for cache aware scheduling's allowance for
+ * load imbalance. If regular load balance does not
+ * migrate task due to LLC locality, it is a expected
+ * behavior and don't pollute nr_balance_failed.
+ * See can_migrate_task().
*/
if (idle != CPU_NEWLY_IDLE &&
- env.migration_type != migrate_misfit)
+ env.migration_type != migrate_misfit &&
+ !(env.flags & LBF_LLC_PINNED))
sd->nr_balance_failed++;
if (need_active_balance(&env)) {
@@ -13756,6 +14869,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
+ task_tick_cache(rq, curr);
+
update_misfit_status(curr, rq);
check_update_overutilized_status(task_rq(curr));