diff options
| author | Peter Zijlstra <peterz@infradead.org> | 2026-05-19 12:18:01 +0200 |
|---|---|---|
| committer | Peter Zijlstra <peterz@infradead.org> | 2026-05-19 12:18:01 +0200 |
| commit | a26d9208c1376ac3877d9f12e697f83368e2af1c (patch) | |
| tree | 1f68b9ae26b8b88a7d1f24fea2a6df87e4c0fee5 | |
| parent | dd29c017aed628076e915fe4cdfb5392fd4c5cab (diff) | |
| parent | c99b8593b060931c5a0a4b701689f8d6a2c00dbf (diff) | |
| download | linux-next-a26d9208c1376ac3877d9f12e697f83368e2af1c.tar.gz linux-next-a26d9208c1376ac3877d9f12e697f83368e2af1c.zip | |
Merge branch 'sched/cache'
Merge the cache aware balancer topic branch.
# Conflicts:
# kernel/sched/topology.c
| -rw-r--r-- | Documentation/admin-guide/kernel-parameters.txt | 12 | ||||
| -rw-r--r-- | arch/x86/include/asm/processor.h | 5 | ||||
| -rw-r--r-- | arch/x86/kernel/smpboot.c | 20 | ||||
| -rw-r--r-- | drivers/base/cacheinfo.c | 23 | ||||
| -rw-r--r-- | include/linux/cacheinfo.h | 1 | ||||
| -rw-r--r-- | include/linux/mm_types.h | 32 | ||||
| -rw-r--r-- | include/linux/sched.h | 30 | ||||
| -rw-r--r-- | include/linux/sched/topology.h | 16 | ||||
| -rw-r--r-- | init/Kconfig | 11 | ||||
| -rw-r--r-- | init/init_task.c | 4 | ||||
| -rw-r--r-- | kernel/exit.c | 29 | ||||
| -rw-r--r-- | kernel/fork.c | 6 | ||||
| -rw-r--r-- | kernel/sched/core.c | 13 | ||||
| -rw-r--r-- | kernel/sched/debug.c | 60 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 1145 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 51 | ||||
| -rw-r--r-- | kernel/sched/topology.c | 447 |
17 files changed, 1849 insertions, 56 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 4d0f545fb3ec..f575d450861e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -7236,6 +7236,18 @@ Kernel parameters Not specifying this option is equivalent to spec_store_bypass_disable=auto. + split_llc= + [X86,EARLY] Split the LLC N-ways + + When set, the LLC is split this many ways by matching + 'core_id % n'. This is setup before SMP bringup and + used during SMP bringup before it knows the full + topology. If your core count doesn't nicely divide by + the number given, you get to keep the pieces. + + This is mostly a debug feature to emulate multiple LLCs + on hardware that only have a single LLC. + split_lock_detect= [X86] Enable split lock detection or bus lock detection diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 10b5355b323e..eb9fce94620d 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -704,6 +704,11 @@ static inline u32 per_cpu_l2c_id(unsigned int cpu) return per_cpu(cpu_info.topo.l2c_id, cpu); } +static inline u32 per_cpu_core_id(unsigned int cpu) +{ + return per_cpu(cpu_info.topo.core_id, cpu); +} + #ifdef CONFIG_CPU_SUP_AMD /* * Issue a DIV 0/1 insn to clear any division data from previous DIV diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 294a8ea60298..cb999feb66b0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -424,6 +424,21 @@ static const struct x86_cpu_id intel_cod_cpu[] = { {} }; +/* + * Allows splitting the LLC by matching 'core_id % split_llc'. + * + * This is mostly a debug hack to emulate systems with multiple LLCs per node + * on systems that do not naturally have this. + */ +static unsigned int split_llc = 0; + +static int __init split_llc_setup(char *str) +{ + get_option(&str, &split_llc); + return 0; +} +early_param("split_llc", split_llc_setup); + static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu); @@ -438,6 +453,11 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) if (per_cpu_llc_id(cpu1) != per_cpu_llc_id(cpu2)) return false; + if (split_llc && + (per_cpu_core_id(cpu1) % split_llc) != + (per_cpu_core_id(cpu2) % split_llc)) + return false; + /* * Allow the SNC topology without warning. Return of false * means 'c' does not share the LLC of 'o'. This will be diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c index 391ac5e3d2f5..70701d3bc81c 100644 --- a/drivers/base/cacheinfo.c +++ b/drivers/base/cacheinfo.c @@ -17,6 +17,7 @@ #include <linux/init.h> #include <linux/of.h> #include <linux/sched.h> +#include <linux/sched/topology.h> #include <linux/slab.h> #include <linux/smp.h> #include <linux/sysfs.h> @@ -68,6 +69,24 @@ bool last_level_cache_is_valid(unsigned int cpu) } +/* + * Get the cacheinfo of the LLC associated with @cpu. + * Derived from update_per_cpu_data_slice_size_cpu(). + */ +struct cacheinfo *get_cpu_cacheinfo_llc(unsigned int cpu) +{ + struct cacheinfo *llc; + + if (!last_level_cache_is_valid(cpu)) + return NULL; + + llc = per_cpu_cacheinfo_idx(cpu, cache_leaves(cpu) - 1); + if (llc->type != CACHE_TYPE_DATA && llc->type != CACHE_TYPE_UNIFIED) + return NULL; + + return llc; +} + bool last_level_cache_is_shared(unsigned int cpu_x, unsigned int cpu_y) { struct cacheinfo *llc_x, *llc_y; @@ -1018,6 +1037,7 @@ static int cacheinfo_cpu_online(unsigned int cpu) goto err; if (cpu_map_shared_cache(true, cpu, &cpu_map)) update_per_cpu_data_slice_size(true, cpu, cpu_map); + sched_update_llc_bytes(cpu); return 0; err: free_cache_attributes(cpu); @@ -1036,6 +1056,9 @@ static int cacheinfo_cpu_pre_down(unsigned int cpu) free_cache_attributes(cpu); if (nr_shared > 1) update_per_cpu_data_slice_size(false, cpu, cpu_map); + + sched_update_llc_bytes(cpu); + return 0; } diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index c8f4f0a0b874..fc879ac4cc4f 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -89,6 +89,7 @@ int populate_cache_leaves(unsigned int cpu); int cache_setup_acpi(unsigned int cpu); bool last_level_cache_is_valid(unsigned int cpu); bool last_level_cache_is_shared(unsigned int cpu_x, unsigned int cpu_y); +struct cacheinfo *get_cpu_cacheinfo_llc(unsigned int cpu); int fetch_cache_info(unsigned int cpu); int detect_cache_attributes(unsigned int cpu); #ifndef CONFIG_ACPI_PPTT diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a308e2c23b82..c7db35be6a30 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1222,6 +1222,8 @@ struct mm_struct { /* MM CID related storage */ struct mm_mm_cid mm_cid; + /* sched_cache related statistics */ + struct sched_cache_stat sc_stat; #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* size of all page tables */ #endif @@ -1628,6 +1630,36 @@ static inline unsigned int mm_cid_size(void) # define MM_CID_STATIC_SIZE 0 #endif /* CONFIG_SCHED_MM_CID */ +#ifdef CONFIG_SCHED_CACHE +void mm_init_sched(struct mm_struct *mm, + struct sched_cache_time __percpu *pcpu_sched); + +static inline int mm_alloc_sched_noprof(struct mm_struct *mm) +{ + struct sched_cache_time __percpu *pcpu_sched = + alloc_percpu_noprof(struct sched_cache_time); + + if (!pcpu_sched) + return -ENOMEM; + + mm_init_sched(mm, pcpu_sched); + return 0; +} + +#define mm_alloc_sched(...) alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__)) + +static inline void mm_destroy_sched(struct mm_struct *mm) +{ + free_percpu(mm->sc_stat.pcpu_sched); + mm->sc_stat.pcpu_sched = NULL; +} +#else /* !CONFIG_SCHED_CACHE */ + +static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; } +static inline void mm_destroy_sched(struct mm_struct *mm) { } + +#endif /* CONFIG_SCHED_CACHE */ + struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); diff --git a/include/linux/sched.h b/include/linux/sched.h index 368c7b4d7cb5..da6a0907a78c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1408,6 +1408,13 @@ struct task_struct { unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ +#ifdef CONFIG_SCHED_CACHE + struct callback_head cache_work; + int preferred_llc; + /* 1: task was enqueued to its preferred LLC, 0 otherwise */ + int pref_llc_queued; +#endif + struct rseq_data rseq; struct sched_mm_cid mm_cid; @@ -2408,6 +2415,29 @@ static __always_inline int task_mm_cid(struct task_struct *t) } #endif +#ifdef CONFIG_SCHED_CACHE + +struct sched_cache_time { + u64 runtime; + unsigned long epoch; +}; + +struct sched_cache_stat { + struct sched_cache_time __percpu *pcpu_sched; + raw_spinlock_t lock; + unsigned long epoch; + u64 nr_running_avg; + unsigned long next_scan; + unsigned long footprint; + int cpu; +} ____cacheline_aligned_in_smp; + +#else + +struct sched_cache_stat { }; + +#endif + #ifndef MODULE #ifndef COMPILE_OFFSETS diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 36553e14866d..fe09d3268bc9 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -68,6 +68,10 @@ struct sched_domain_shared { atomic_t nr_busy_cpus; int has_idle_cores; int nr_idle_scan; +#ifdef CONFIG_SCHED_CACHE + unsigned long util_avg; + unsigned long capacity; +#endif }; struct sched_domain { @@ -99,6 +103,12 @@ struct sched_domain { u64 max_newidle_lb_cost; unsigned long last_decay_max_lb_cost; +#ifdef CONFIG_SCHED_CACHE + unsigned int llc_max; + unsigned int *llc_counts __counted_by_ptr(llc_max); + unsigned long llc_bytes; +#endif + #ifdef CONFIG_SCHEDSTATS /* sched_balance_rq() stats */ unsigned int lb_count[CPU_MAX_IDLE_TYPES]; @@ -256,4 +266,10 @@ static inline int task_node(const struct task_struct *p) return cpu_to_node(task_cpu(p)); } +#ifdef CONFIG_SCHED_CACHE +extern void sched_update_llc_bytes(unsigned int cpu); +#else +static inline void sched_update_llc_bytes(unsigned int cpu) { } +#endif + #endif /* _LINUX_SCHED_TOPOLOGY_H */ diff --git a/init/Kconfig b/init/Kconfig index 2937c4d308ae..a52cd95ef171 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1019,6 +1019,17 @@ config NUMA_BALANCING This system will be inactive on UMA systems. +config SCHED_CACHE + bool "Cache aware load balance" + default y + depends on SMP + help + When enabled, the scheduler will attempt to aggregate tasks from + the same process onto a single Last Level Cache (LLC) domain when + possible. This improves cache locality by keeping tasks that share + resources within the same cache domain, reducing cache misses and + lowering data access latency. + config NUMA_BALANCING_DEFAULT_ENABLED bool "Automatically enable NUMA aware memory/task placement" default y diff --git a/init/init_task.c b/init/init_task.c index b5f48ebdc2b6..3ecd66fbd563 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -215,6 +215,10 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { .numa_group = NULL, .numa_faults = NULL, #endif +#ifdef CONFIG_SCHED_CACHE + .preferred_llc = -1, + .pref_llc_queued = 0, +#endif #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) .kasan_depth = 1, #endif diff --git a/kernel/exit.c b/kernel/exit.c index 25e9cb6de7e7..c6e7047739b0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -543,6 +543,32 @@ void mm_update_next_owner(struct mm_struct *mm) } #endif /* CONFIG_MEMCG */ +#if defined(CONFIG_SCHED_CACHE) && defined(CONFIG_NUMA_BALANCING) +/* + * Subtract the memory footprint of the current task from + * mm. + */ +static void exit_mm_sched_cache(struct mm_struct *mm) +{ + unsigned long fp, sub; + + if (!current->total_numa_faults) + return; + /* + * No lock protection due to performance considerations. + * Make sure mm->sc_stat.footprint does not become + * negative. + */ + fp = READ_ONCE(mm->sc_stat.footprint); + sub = min(fp, current->total_numa_faults); + WRITE_ONCE(mm->sc_stat.footprint, fp - sub); +} +#else +static inline void exit_mm_sched_cache(struct mm_struct *mm) +{ +} +#endif /* CONFIG_SCHED_CACHE CONFIG_NUMA_BALANCING */ + /* * Turn us into a lazy TLB process if we * aren't already.. @@ -554,6 +580,9 @@ static void exit_mm(void) exit_mm_release(current, mm); if (!mm) return; + + exit_mm_sched_cache(mm); + mmap_read_lock(mm); mmgrab_lazy_tlb(mm); BUG_ON(mm != current->active_mm); diff --git a/kernel/fork.c b/kernel/fork.c index 5f3fdfdb14c7..a679b2448234 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -726,6 +726,7 @@ void __mmdrop(struct mm_struct *mm) cleanup_lazy_tlbs(mm); WARN_ON_ONCE(mm == current->active_mm); + mm_destroy_sched(mm); mm_free_pgd(mm); mm_free_id(mm); destroy_context(mm); @@ -1128,6 +1129,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (mm_alloc_cid(mm, p)) goto fail_cid; + if (mm_alloc_sched(mm)) + goto fail_sched; + if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, NR_MM_COUNTERS)) goto fail_pcpu; @@ -1137,6 +1141,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, return mm; fail_pcpu: + mm_destroy_sched(mm); +fail_sched: mm_destroy_cid(mm); fail_cid: destroy_context(mm); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 77f4ebe8f5c7..7fb3f5f2d48c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -548,6 +548,11 @@ void __trace_set_current_state(int state_value) } EXPORT_SYMBOL(__trace_set_current_state); +int task_llc(const struct task_struct *p) +{ + return per_cpu(sd_llc_id, task_cpu(p)); +} + /* * Serialization rules: * @@ -4506,6 +4511,7 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p) init_numa_balancing(clone_flags, p); p->wake_entry.u_flags = CSD_TYPE_TTWU; p->migration_pending = NULL; + init_sched_mm(p); } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -8699,6 +8705,8 @@ int sched_cpu_deactivate(unsigned int cpu) */ synchronize_rcu(); + sched_domains_free_llc_id(cpu); + sched_set_rq_offline(rq, cpu); scx_rq_deactivate(rq); @@ -9030,6 +9038,11 @@ void __init sched_init(void) rq->core_cookie = 0UL; #endif +#ifdef CONFIG_SCHED_CACHE + raw_spin_lock_init(&rq->cpu_epoch_lock); + rq->cpu_epoch_next = jiffies; +#endif + zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i)); } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 74c1617cf652..ed3a0d65da0c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -210,6 +210,48 @@ static const struct file_operations sched_scaling_fops = { .release = single_release, }; +#ifdef CONFIG_SCHED_CACHE +static ssize_t +sched_cache_enable_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + bool val; + int ret; + + ret = kstrtobool_from_user(ubuf, cnt, &val); + if (ret) + return ret; + + sysctl_sched_cache_user = val; + + sched_cache_active_set(); + + *ppos += cnt; + + return cnt; +} + +static int sched_cache_enable_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", sysctl_sched_cache_user); + return 0; +} + +static int sched_cache_enable_open(struct inode *inode, + struct file *filp) +{ + return single_open(filp, sched_cache_enable_show, NULL); +} + +static const struct file_operations sched_cache_enable_fops = { + .open = sched_cache_enable_open, + .write = sched_cache_enable_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + #ifdef CONFIG_PREEMPT_DYNAMIC static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, @@ -593,7 +635,7 @@ static void debugfs_ext_server_init(void) static __init int sched_init_debug(void) { - struct dentry __maybe_unused *numa; + struct dentry __maybe_unused *numa, *llc; debugfs_sched = debugfs_create_dir("sched", NULL); @@ -626,6 +668,22 @@ static __init int sched_init_debug(void) debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); #endif /* CONFIG_NUMA_BALANCING */ +#ifdef CONFIG_SCHED_CACHE + llc = debugfs_create_dir("llc_balancing", debugfs_sched); + debugfs_create_file("enabled", 0644, llc, NULL, + &sched_cache_enable_fops); + debugfs_create_u32("aggr_tolerance", 0644, llc, + &llc_aggr_tolerance); + debugfs_create_u32("epoch_period", 0644, llc, + &llc_epoch_period); + debugfs_create_u32("epoch_affinity_timeout", 0644, llc, + &llc_epoch_affinity_timeout); + debugfs_create_u32("overaggr_pct", 0644, llc, + &llc_overaggr_pct); + debugfs_create_u32("imb_pct", 0644, llc, + &llc_imb_pct); +#endif + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); debugfs_fair_server_init(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 69ba882681c5..8e858ca6bcd0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1321,6 +1321,8 @@ void post_init_entity_util_avg(struct task_struct *p) sa->runnable_avg = sa->util_avg; } +static inline void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec); + static s64 update_se(struct rq *rq, struct sched_entity *se) { u64 now = rq_clock_task(rq); @@ -1343,6 +1345,7 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) trace_sched_stat_runtime(running, delta_exec); account_group_exec_runtime(running, delta_exec); + account_mm_sched(rq, running, delta_exec); /* cgroup time is always accounted against the donor */ cgroup_account_cputime(donor, delta_exec); @@ -1364,6 +1367,581 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) static void set_next_buddy(struct sched_entity *se); +#ifdef CONFIG_SCHED_CACHE + +/* + * XXX numbers come from a place the sun don't shine -- probably wants to be SD + * tunable or so. + */ +#define EPOCH_PERIOD (HZ / 100) /* 10 ms */ +#define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */ +__read_mostly unsigned int llc_aggr_tolerance = 1; +__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD; +__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT; +__read_mostly unsigned int llc_imb_pct = 20; +__read_mostly unsigned int llc_overaggr_pct = 50; + +static int llc_id(int cpu) +{ + if (cpu < 0) + return -1; + + return per_cpu(sd_llc_id, cpu); +} + +static inline int get_sched_cache_scale(int mul) +{ + unsigned int tol = READ_ONCE(llc_aggr_tolerance); + + if (!tol) + return 0; + + if (tol >= 100) + return INT_MAX; + + return (1 + (tol - 1) * mul); +} + +static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) +{ +#ifdef CONFIG_NUMA_BALANCING + unsigned long llc, footprint; + struct sched_domain *sd; + int scale; + + guard(rcu)(); + + sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd); + if (!sd) + return true; + + if (static_branch_likely(&sched_numa_balancing)) { + /* + * TBD: RDT exclusive LLC ways reserved should be + * excluded. + */ + llc = sd->llc_bytes; + footprint = READ_ONCE(mm->sc_stat.footprint); + + /* + * Scale the LLC size by 256*llc_aggr_tolerance + * and compare it to the task's footprint. + * + * Suppose the L3 size is 32MB. If the + * llc_aggr_tolerance is 1: + * When the footprint is larger than 32MB, the + * process is regarded as exceeding the LLC + * capacity. If the llc_aggr_tolerance is 99: + * When the footprint is larger than 784GB, the + * process is regarded as exceeding the LLC + * capacity: + * 784GB = (1 + (99 - 1) * 256) * 32MB + * If the llc_aggr_tolerance is 100: + * ignore the footprint and do the aggregation + * anyway. + */ + scale = get_sched_cache_scale(256); + if (scale == INT_MAX) + return false; + + return ((llc * (u64)scale) < (footprint * PAGE_SIZE)); + } +#endif + return false; +} + +static bool invalid_llc_nr(struct mm_struct *mm, struct task_struct *p, + int cpu) +{ + int scale; + + if (get_nr_threads(p) <= 1) + return true; + + /* + * Scale the number of 'cores' in a LLC by llc_aggr_tolerance + * and compare it to the task's active threads. + */ + scale = get_sched_cache_scale(1); + if (scale == INT_MAX) + return false; + + return !fits_capacity((mm->sc_stat.nr_running_avg * cpu_smt_num_threads), + (scale * per_cpu(sd_llc_size, cpu))); +} + +static void account_llc_enqueue(struct rq *rq, struct task_struct *p) +{ + int pref_llc, pref_llc_queued; + struct sched_domain *sd; + + pref_llc = p->preferred_llc; + if (pref_llc < 0) + return; + + pref_llc_queued = (pref_llc == task_llc(p)); + rq->nr_llc_running++; + rq->nr_pref_llc_running += pref_llc_queued; + + /* + * Record whether p is enqueued on its preferred + * LLC, in order to pair with account_llc_dequeue() + * to maintain a consistent nr_pref_llc_running per + * runqueue. + * This is necessary because a race condition exists: + * after a task is enqueued on a runqueue, task_llc(p) + * may change due to CPU hotplug. Therefore, checking + * task_llc(p) to determine whether the task is being + * dequeued from its preferred LLC is unreliable and + * can cause inconsistent values - checking the + * p->pref_llc_queued in account_llc_dequeue() would + * be reliable. + */ + p->pref_llc_queued = pref_llc_queued; + + sd = rcu_dereference_all(rq->sd); + if (sd && (unsigned int)pref_llc < sd->llc_max) + sd->llc_counts[pref_llc]++; +} + +static void account_llc_dequeue(struct rq *rq, struct task_struct *p) +{ + struct sched_domain *sd; + int pref_llc; + + pref_llc = p->preferred_llc; + if (pref_llc < 0) + return; + + rq->nr_llc_running--; + if (p->pref_llc_queued) { + rq->nr_pref_llc_running--; + /* + * Update the status in case + * other logic might query + * this. + */ + p->pref_llc_queued = 0; + } + + sd = rcu_dereference_all(rq->sd); + if (sd && (unsigned int)pref_llc < sd->llc_max) { + /* + * There is a race condition between dequeue + * and CPU hotplug. After a task has been enqueued + * on CPUx, a CPU hotplug event occurs, and all online + * CPUs (including CPUx) rebuild their sched_domains + * and reset statistics to zero(including sd->llc_counts). + * This can cause temporary undercount and we have to + * check for such underflow in sd->llc_counts. + * + * This undercount is temporary and accurate accounting + * will resume once the rq has a chance to be idle. + */ + if (sd->llc_counts[pref_llc]) + sd->llc_counts[pref_llc]--; + } +} + +void mm_init_sched(struct mm_struct *mm, + struct sched_cache_time __percpu *_pcpu_sched) +{ + unsigned long epoch = 0; + int i; + + for_each_possible_cpu(i) { + struct sched_cache_time *pcpu_sched = per_cpu_ptr(_pcpu_sched, i); + struct rq *rq = cpu_rq(i); + + pcpu_sched->runtime = 0; + /* a slightly stale cpu epoch is acceptible */ + pcpu_sched->epoch = rq->cpu_epoch; + epoch = rq->cpu_epoch; + } + + raw_spin_lock_init(&mm->sc_stat.lock); + mm->sc_stat.epoch = epoch; + mm->sc_stat.cpu = -1; + mm->sc_stat.next_scan = jiffies; + mm->sc_stat.nr_running_avg = 0; + mm->sc_stat.footprint = 0; + /* + * The update to mm->sc_stat should not be reordered + * before initialization to mm's other fields, in case + * the readers may get invalid mm_sched_epoch, etc. + */ + smp_store_release(&mm->sc_stat.pcpu_sched, _pcpu_sched); +} + +/* because why would C be fully specified */ +static __always_inline void __shr_u64(u64 *val, unsigned int n) +{ + if (n >= 64) { + *val = 0; + return; + } + *val >>= n; +} + +static inline void __update_mm_sched(struct rq *rq, + struct sched_cache_time *pcpu_sched) +{ + lockdep_assert_held(&rq->cpu_epoch_lock); + + unsigned int period = max(READ_ONCE(llc_epoch_period), 1U); + unsigned long n, now = jiffies; + long delta = now - rq->cpu_epoch_next; + + if (delta > 0) { + n = (delta + period - 1) / period; + rq->cpu_epoch += n; + rq->cpu_epoch_next += n * period; + __shr_u64(&rq->cpu_runtime, n); + } + + n = rq->cpu_epoch - pcpu_sched->epoch; + if (n) { + pcpu_sched->epoch += n; + __shr_u64(&pcpu_sched->runtime, n); + } +} + +static unsigned long fraction_mm_sched(struct rq *rq, + struct sched_cache_time *pcpu_sched) +{ + guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock); + + __update_mm_sched(rq, pcpu_sched); + + /* + * Runtime is a geometric series (r=0.5) and as such will sum to twice + * the accumulation period, this means the multiplcation here should + * not overflow. + */ + return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); +} + +static int get_pref_llc(struct task_struct *p, struct mm_struct *mm) +{ + int mm_sched_llc = -1, mm_sched_cpu; + + if (!mm) + return -1; + + mm_sched_cpu = READ_ONCE(mm->sc_stat.cpu); + if (mm_sched_cpu != -1) { + mm_sched_llc = llc_id(mm_sched_cpu); + +#ifdef CONFIG_NUMA_BALANCING + /* + * Don't assign preferred LLC if it + * conflicts with NUMA balancing. + * This can happen when sched_setnuma() gets + * called, however it is not much of an issue + * because we expect account_mm_sched() to get + * called fairly regularly -- at a higher rate + * than sched_setnuma() at least -- and thus the + * conflict only exists for a short period of time. + */ + if (static_branch_likely(&sched_numa_balancing) && + p->numa_preferred_nid >= 0 && + cpu_to_node(mm_sched_cpu) != p->numa_preferred_nid) + mm_sched_llc = -1; +#endif + } + + return mm_sched_llc; +} + +static unsigned int task_running_on_cpu(int cpu, struct task_struct *p); + +static inline +void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) +{ + struct sched_cache_time *pcpu_sched; + struct mm_struct *mm = p->mm; + int mm_sched_llc = -1; + unsigned long epoch; + + if (!sched_cache_enabled()) + return; + + if (p->sched_class != &fair_sched_class) + return; + /* + * init_task, kthreads and user thread created + * by user_mode_thread() don't have mm. + */ + if (!mm || !mm->sc_stat.pcpu_sched) + return; + + pcpu_sched = per_cpu_ptr(mm->sc_stat.pcpu_sched, cpu_of(rq)); + + scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) { + __update_mm_sched(rq, pcpu_sched); + pcpu_sched->runtime += delta_exec; + rq->cpu_runtime += delta_exec; + epoch = rq->cpu_epoch; + } + + /* + * If this process hasn't hit task_cache_work() for a while invalidate + * its preferred state. + */ + if ((long)(epoch - READ_ONCE(mm->sc_stat.epoch)) > llc_epoch_affinity_timeout || + invalid_llc_nr(mm, p, cpu_of(rq)) || + exceed_llc_capacity(mm, cpu_of(rq))) { + if (READ_ONCE(mm->sc_stat.cpu) != -1) + WRITE_ONCE(mm->sc_stat.cpu, -1); + } + + mm_sched_llc = get_pref_llc(p, mm); + + /* task not on rq accounted later in account_entity_enqueue() */ + if (task_running_on_cpu(rq->cpu, p) && + READ_ONCE(p->preferred_llc) != mm_sched_llc) { + account_llc_dequeue(rq, p); + WRITE_ONCE(p->preferred_llc, mm_sched_llc); + account_llc_enqueue(rq, p); + } +} + +static void task_tick_cache(struct rq *rq, struct task_struct *p) +{ + struct callback_head *work = &p->cache_work; + struct mm_struct *mm = p->mm; + unsigned long epoch; + + if (!sched_cache_enabled()) + return; + + if (!mm || p->flags & PF_KTHREAD || + !mm->sc_stat.pcpu_sched) + return; + + epoch = rq->cpu_epoch; + /* avoid moving backwards */ + if (time_after_eq(mm->sc_stat.epoch, epoch)) + return; + + guard(raw_spinlock)(&mm->sc_stat.lock); + + if (work->next == work) { + task_work_add(p, work, TWA_RESUME); + WRITE_ONCE(mm->sc_stat.epoch, epoch); + } +} + +static void get_scan_cpumasks(cpumask_var_t cpus, struct task_struct *p) +{ +#ifdef CONFIG_NUMA_BALANCING + int cpu, curr_cpu, nid, pref_nid; + + if (!static_branch_likely(&sched_numa_balancing)) + goto out; + + cpu = READ_ONCE(p->mm->sc_stat.cpu); + if (cpu != -1) + nid = cpu_to_node(cpu); + curr_cpu = task_cpu(p); + + /* + * Scanning in the preferred NUMA node is ideal. However, the NUMA + * preferred node is per-task rather than per-process. It is possible + * for different threads of the process to have distinct preferred + * nodes; consequently, the process-wide preferred LLC may bounce + * between different nodes. As a workaround, maintain the scan + * CPU mask to also cover the process's current preferred LLC and the + * current running node to mitigate the bouncing risk. + * TBD: numa_group should be considered during task aggregation. + */ + pref_nid = p->numa_preferred_nid; + /* honor the task's preferred node */ + if (pref_nid == NUMA_NO_NODE) + goto out; + + cpumask_or(cpus, cpus, cpumask_of_node(pref_nid)); + + /* honor the task's preferred LLC CPU */ + if (cpu != -1 && !cpumask_test_cpu(cpu, cpus) && nid != NUMA_NO_NODE) + cpumask_or(cpus, cpus, cpumask_of_node(nid)); + + /* make sure the task's current running node is included */ + if (!cpumask_test_cpu(curr_cpu, cpus)) + cpumask_or(cpus, cpus, cpumask_of_node(cpu_to_node(curr_cpu))); + + return; + +out: +#endif + cpumask_copy(cpus, cpu_online_mask); +} + +static inline void update_avg_scale(u64 *avg, u64 sample) +{ + int factor = per_cpu(sd_llc_size, raw_smp_processor_id()); + s64 diff = sample - *avg; + u32 divisor; + + /* + * Scale the divisor based on the number of CPUs contained + * in the LLC. This scaling ensures smaller LLC domains use + * a smaller divisor to achieve more precise sensitivity to + * changes in nr_running, while larger LLC domains are capped + * at a maximum divisor of 8 which is the default smoothing + * factor of EWMA in update_avg(). + */ + divisor = clamp_t(u32, (factor >> 2), 2, 8); + *avg += div64_s64(diff, divisor); +} + +static void task_cache_work(struct callback_head *work) +{ + int cpu, m_a_cpu = -1, nr_running = 0, curr_cpu; + unsigned long next_scan, now = jiffies; + struct task_struct *p = current, *cur; + unsigned long curr_m_a_occ = 0; + struct mm_struct *mm = p->mm; + unsigned long m_a_occ = 0; + cpumask_var_t cpus; + + WARN_ON_ONCE(work != &p->cache_work); + + work->next = work; + + if (p->flags & PF_EXITING) + return; + + next_scan = READ_ONCE(mm->sc_stat.next_scan); + if (time_before(now, next_scan)) + return; + + /* only 1 thread is allowed to scan */ + if (!try_cmpxchg(&mm->sc_stat.next_scan, &next_scan, + now + max_t(unsigned long, + READ_ONCE(llc_epoch_period), 1))) + return; + + curr_cpu = task_cpu(p); + if (invalid_llc_nr(mm, p, curr_cpu) || + exceed_llc_capacity(mm, curr_cpu)) { + if (READ_ONCE(mm->sc_stat.cpu) != -1) + WRITE_ONCE(mm->sc_stat.cpu, -1); + + return; + } + + if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) + return; + + scoped_guard (cpus_read_lock) { + guard(rcu)(); + + get_scan_cpumasks(cpus, p); + + for_each_cpu(cpu, cpus) { + /* XXX sched_cluster_active */ + struct sched_domain *sd = rcu_dereference_all(per_cpu(sd_llc, cpu)); + unsigned long occ, m_occ = 0, a_occ = 0; + int m_cpu = -1, i; + + if (!sd) + continue; + + for_each_cpu(i, sched_domain_span(sd)) { + occ = fraction_mm_sched(cpu_rq(i), + per_cpu_ptr(mm->sc_stat.pcpu_sched, i)); + a_occ += occ; + if (occ > m_occ) { + m_occ = occ; + m_cpu = i; + } + + cur = rcu_dereference_all(cpu_rq(i)->curr); + if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) && + cur->mm == mm) + nr_running++; + } + + /* + * Compare the accumulated occupancy of each LLC. The + * reason for using accumulated occupancy rather than average + * per CPU occupancy is that it works better in asymmetric LLC + * scenarios. + * For example, if there are 2 threads in a 4CPU LLC and 3 + * threads in an 8CPU LLC, it might be better to choose the one + * with 3 threads. However, this would not be the case if the + * occupancy is divided by the number of CPUs in an LLC (i.e., + * if average per CPU occupancy is used). + * Besides, NUMA balancing fault statistics behave similarly: + * the total number of faults per node is compared rather than + * the average number of faults per CPU. This strategy is also + * followed here. + */ + if (a_occ > m_a_occ) { + m_a_occ = a_occ; + m_a_cpu = m_cpu; + } + + if (llc_id(cpu) == llc_id(READ_ONCE(mm->sc_stat.cpu))) + curr_m_a_occ = a_occ; + + cpumask_andnot(cpus, cpus, sched_domain_span(sd)); + } + } + + if (m_a_occ > (2 * curr_m_a_occ)) { + /* + * Avoid switching sc_stat.cpu too fast. + * The reason to choose 2X is because: + * 1. It is better to keep the preferred LLC stable, + * rather than changing it frequently and cause migrations + * 2. 2X means the new preferred LLC has at least 1 more + * busy CPU than the old one(200% vs 100%, eg) + * 3. 2X is chosen based on test results, as it delivers + * the optimal performance gain so far. + */ + WRITE_ONCE(mm->sc_stat.cpu, m_a_cpu); + } + + update_avg_scale(&mm->sc_stat.nr_running_avg, nr_running); + free_cpumask_var(cpus); +} + +void init_sched_mm(struct task_struct *p) +{ + struct callback_head *work = &p->cache_work; + + init_task_work(work, task_cache_work); + work->next = work; + /* + * Reset new task's preference to avoid + * polluting account_llc_enqueue(). + */ + p->preferred_llc = -1; +} + +#else /* CONFIG_SCHED_CACHE */ + +static inline void account_mm_sched(struct rq *rq, struct task_struct *p, + s64 delta_exec) { } + +void init_sched_mm(struct task_struct *p) { } + +static void task_tick_cache(struct rq *rq, struct task_struct *p) { } + +static inline int get_pref_llc(struct task_struct *p, + struct mm_struct *mm) +{ + return -1; +} + +static void account_llc_enqueue(struct rq *rq, struct task_struct *p) {} + +static void account_llc_dequeue(struct rq *rq, struct task_struct *p) {} + +#endif /* CONFIG_SCHED_CACHE */ + /* * Used by other classes to account runtime. */ @@ -3038,6 +3616,7 @@ static void task_numa_placement(struct task_struct *p) unsigned long total_faults; u64 runtime, period; spinlock_t *group_lock = NULL; + long __maybe_unused new_fp; struct numa_group *ng; /* @@ -3112,6 +3691,31 @@ static void task_numa_placement(struct task_struct *p) ng->total_faults += diff; group_faults += ng->faults[mem_idx]; } +#ifdef CONFIG_SCHED_CACHE + /* + * Per task p->numa_faults[mem_idx] converges, + * so the accumulation of each task's faults + * converges too - Given the number of threads, + * it cannot overflow an unsigned long. + * Racy with concurrent updates from other threads + * sharing this mm. Acceptable since footprint is a + * heuristic and occasional lost updates are tolerable. + * + * If a task exits, its corresponding footprint must + * be subtracted from the mm->sc_stat.footprint, otherwise + * the mm->sc_stat.footprint will not converge: + * the exiting thread's footprint remains unchanged/undecayed + * in mm->sc_stat.footprint. See exit_mm(). + * + * Lost updates and unsynchronized subtraction + * in exit_mm() can cause footprint + diff to + * go negative. Clamp to zero to prevent the + * unsigned footprint from wrapping. + */ + new_fp = (long)READ_ONCE(p->mm->sc_stat.footprint) + diff; + WRITE_ONCE(p->mm->sc_stat.footprint, + max(new_fp, 0L)); +#endif } if (!ng) { @@ -3836,9 +4440,11 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); if (entity_is_task(se)) { + struct task_struct *p = task_of(se); struct rq *rq = rq_of(cfs_rq); - account_numa_enqueue(rq, task_of(se)); + account_numa_enqueue(rq, p); + account_llc_enqueue(rq, p); list_add(&se->group_node, &rq->cfs_tasks); } cfs_rq->nr_queued++; @@ -3849,7 +4455,11 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); if (entity_is_task(se)) { - account_numa_dequeue(rq_of(cfs_rq), task_of(se)); + struct task_struct *p = task_of(se); + struct rq *rq = rq_of(cfs_rq); + + account_numa_dequeue(rq, p); + account_llc_dequeue(rq, p); list_del_init(&se->group_node); } cfs_rq->nr_queued--; @@ -9617,6 +10227,16 @@ enum group_type { */ group_imbalanced, /* + * There are tasks running on non-preferred LLC, possible to move + * them to their preferred LLC without creating too much imbalance. + * The priority of group_llc_balance is lower than that of + * group_overloaded and higher than that of all other group types. + * This is because group_llc_balance may exacerbate load imbalance. + * If the LLC balancing attempt fails, the nr_balance_failed + * mechanism will trigger other group types to rebalance the load. + */ + group_llc_balance, + /* * The CPU is overloaded and can't provide expected CPU cycles to all * tasks. */ @@ -9627,7 +10247,8 @@ enum migration_type { migrate_load = 0, migrate_util, migrate_task, - migrate_misfit + migrate_misfit, + migrate_llc_task }; #define LBF_ALL_PINNED 0x01 @@ -9635,6 +10256,7 @@ enum migration_type { #define LBF_DST_PINNED 0x04 #define LBF_SOME_PINNED 0x08 #define LBF_ACTIVE_LB 0x10 +#define LBF_LLC_PINNED 0x20 struct lb_env { struct sched_domain *sd; @@ -9792,6 +10414,298 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ return 0; } +#ifdef CONFIG_SCHED_CACHE +/* + * The margin used when comparing LLC utilization with CPU capacity. + * It determines the LLC load level where active LLC aggregation is + * done. + * Derived from fits_capacity(). + * + * (default: ~50%, tunable via debugfs) + */ +static bool fits_llc_capacity(unsigned long util, unsigned long max) +{ + u32 aggr_pct = llc_overaggr_pct; + + /* + * For single core systems, raise the aggregation + * threshold to accommodate more tasks. + */ + if (cpu_smt_num_threads == 1) + aggr_pct = (aggr_pct * 3 / 2); + + return util * 100 < max * aggr_pct; +} + +/* + * The margin used when comparing utilization. + * is 'util1' noticeably greater than 'util2' + * Derived from capacity_greater(). + * Bias is in perentage. + */ +/* Allows dst util to be bigger than src util by up to bias percent */ +#define util_greater(util1, util2) \ + ((util1) * 100 > (util2) * (100 + llc_imb_pct)) + +static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) +{ + struct sched_domain_shared *sd_share; + + sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); + if (!sd_share) + return false; + + *util = READ_ONCE(sd_share->util_avg); + *cap = READ_ONCE(sd_share->capacity); + + return true; +} + +/* + * Decision matrix according to the LLC utilization. To + * decide whether we can do task aggregation across LLC. + * + * By default, 50% is the threshold for treating the LLC + * as busy. The reason for choosing 50% is to avoid saturation + * of SMT-2, and it is also a safe cutoff for other SMT-n + * platforms. SMT-1 has higher threshold because it is + * supposed to accommodate more tasks, see fits_llc_capacity(). + * + * 20% is the utilization imbalance percentage to decide + * if the preferred LLC is busier than the non-preferred LLC. + * 20 is a little higher than the LLC domain's imbalance_pct + * 17. The hysteresis is used to avoid task bouncing between the + * preferred LLC and the non-preferred LLC, and it will + * be turned into tunable debugfs. + * + * 1. moving towards the preferred LLC, dst is the preferred + * LLC, src is not. + * + * src \ dst 30% 40% 50% 60% + * 30% Y Y Y N + * 40% Y Y Y Y + * 50% Y Y G G + * 60% Y Y G G + * + * 2. moving out of the preferred LLC, src is the preferred + * LLC, dst is not: + * + * src \ dst 30% 40% 50% 60% + * 30% N N N N + * 40% N N N N + * 50% N N G G + * 60% Y N G G + * + * src : src_util + * dst : dst_util + * Y : Yes, migrate + * N : No, do not migrate + * G : let the Generic load balance to even the load. + * + * The intention is that if both LLCs are quite busy, cache aware + * load balance should not be performed, and generic load balance + * should take effect. However, if one is busy and the other is not, + * the preferred LLC capacity(50%) and imbalance criteria(20%) should + * be considered to determine whether LLC aggregation should be + * performed to bias the load towards the preferred LLC. + */ + +/* migration decision, 3 states are orthogonal. */ +enum llc_mig { + mig_forbid = 0, /* N: Don't migrate task, respect LLC preference */ + mig_llc, /* Y: Do LLC preference based migration */ + mig_unrestricted /* G: Don't restrict generic load balance migration */ +}; + +/* + * Check if task can be moved from the source LLC to the + * destination LLC without breaking cache aware preferrence. + * src_cpu and dst_cpu are arbitrary CPUs within the source + * and destination LLCs, respectively. + */ +static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu, + unsigned long tsk_util, + bool to_pref) +{ + unsigned long src_util, dst_util, src_cap, dst_cap; + + if (!get_llc_stats(src_cpu, &src_util, &src_cap) || + !get_llc_stats(dst_cpu, &dst_util, &dst_cap)) + return mig_unrestricted; + + src_util = src_util < tsk_util ? 0 : src_util - tsk_util; + dst_util = dst_util + tsk_util; + + if (!fits_llc_capacity(dst_util, dst_cap) && + !fits_llc_capacity(src_util, src_cap)) + return mig_unrestricted; + + if (to_pref) { + /* + * Don't migrate if we will get preferred LLC too + * heavily loaded and if the dest is much busier + * than the src, in which case migration will + * increase the imbalance too much. + */ + if (!fits_llc_capacity(dst_util, dst_cap) && + util_greater(dst_util, src_util)) + return mig_forbid; + } else { + /* + * Don't migrate if we will leave preferred LLC + * too idle, or if this migration leads to the + * non-preferred LLC falls within sysctl_aggr_imb percent + * of preferred LLC, leading to migration again + * back to preferred LLC. + */ + if (fits_llc_capacity(src_util, src_cap) || + !util_greater(src_util, dst_util)) + return mig_forbid; + } + return mig_llc; +} + +/* + * Check if task p can migrate from source LLC to + * destination LLC in terms of cache aware load balance. + */ +static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, + struct task_struct *p) +{ + struct mm_struct *mm; + bool to_pref; + int cpu; + + mm = p->mm; + if (!mm) + return mig_unrestricted; + + cpu = READ_ONCE(mm->sc_stat.cpu); + if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) + return mig_unrestricted; + + /* skip cache aware load balance for too many threads */ + if (invalid_llc_nr(mm, p, dst_cpu) || + exceed_llc_capacity(mm, dst_cpu)) { + if (READ_ONCE(mm->sc_stat.cpu) != -1) + WRITE_ONCE(mm->sc_stat.cpu, -1); + return mig_unrestricted; + } + + if (cpus_share_cache(dst_cpu, cpu)) + to_pref = true; + else if (cpus_share_cache(src_cpu, cpu)) + to_pref = false; + else + return mig_unrestricted; + + return can_migrate_llc(src_cpu, dst_cpu, + task_util(p), to_pref); +} + +/* + * Check if active load balance breaks LLC locality in + * terms of cache aware load balance. The load level and + * imbalance do not warrant breaking LLC preference per + * the can_migrate_llc() policy. Here, the benefit of + * LLC locality outweighs the power efficiency gained from + * migrating the only runnable task away. + */ +static inline bool +alb_break_llc(struct lb_env *env) +{ + if (!sched_cache_enabled()) + return false; + + if (cpus_share_cache(env->src_cpu, env->dst_cpu)) + return false; + /* + * All tasks prefer to stay on their current CPU. + * Do not pull a task from its preferred CPU if: + * 1. It is the only task running and does not exceed + * imbalance allowance; OR + * 2. Migrating it away from its preferred LLC would violate + * the cache-aware scheduling policy. + */ + if (env->src_rq->nr_pref_llc_running && + env->src_rq->nr_pref_llc_running == env->src_rq->cfs.h_nr_runnable) { + unsigned long util = 0; + struct task_struct *cur; + + if (env->src_rq->nr_running <= 1) + return true; + + cur = rcu_dereference_all(env->src_rq->curr); + if (cur && cur->sched_class == &fair_sched_class) + util = task_util(cur); + + if (can_migrate_llc(env->src_cpu, env->dst_cpu, + util, false) == mig_forbid) + return true; + } + + return false; +} + +/* + * Check if migrating task p from env->src_cpu to + * env->dst_cpu breaks LLC localiy. + */ +static bool migrate_degrades_llc(struct task_struct *p, struct lb_env *env) +{ + if (!sched_cache_enabled()) + return false; + + if (task_has_sched_core(p)) + return false; + /* + * Skip over tasks that would degrade LLC locality; + * only when nr_balanced_failed is sufficiently high do we + * ignore this constraint. + * + * Threshold of cache_nice_tries is set to 1 higher + * than nr_balance_failed to avoid excessive task + * migration at the same time. + */ + if (env->sd->nr_balance_failed >= env->sd->cache_nice_tries + 1) + return false; + + /* + * We know the env->src_cpu has some tasks prefer to + * run on env->dst_cpu, skip the tasks do not prefer + * env->dst_cpu, and find the one that prefers. + */ + if (env->migration_type == migrate_llc_task && + READ_ONCE(p->preferred_llc) != llc_id(env->dst_cpu)) + return true; + + if (can_migrate_llc_task(env->src_cpu, + env->dst_cpu, p) != mig_forbid) + return false; + + return true; +} + +#else +static inline bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) +{ + return false; +} + +static inline bool +alb_break_llc(struct lb_env *env) +{ + return false; +} + +static inline bool +migrate_degrades_llc(struct task_struct *p, struct lb_env *env) +{ + return false; +} +#endif /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ @@ -9888,10 +10802,29 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 1; degrades = migrate_degrades_locality(p, env); - if (!degrades) + if (!degrades) { + /* + * If the NUMA locality is not broken, + * further check if migration would hurt + * LLC locality. + */ + if (migrate_degrades_llc(p, env)) { + /* + * If regular load balancing fails to pull a task + * due to LLC locality, this is expected behavior + * and we set LBF_LLC_PINNED so we don't increase + * nr_balance_failed unecessarily. + */ + if (env->migration_type != migrate_llc_task) + env->flags |= LBF_LLC_PINNED; + + return 0; + } + hot = task_hot(p, env); - else + } else { hot = degrades > 0; + } if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (hot) @@ -10053,6 +10986,10 @@ static int detach_tasks(struct lb_env *env) env->imbalance = 0; break; + + case migrate_llc_task: + env->imbalance--; + break; } detach_task(p, env); @@ -10331,12 +11268,16 @@ struct sg_lb_stats { enum group_type group_type; unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ unsigned int group_smt_balance; /* Task on busy SMT be moved */ + unsigned int group_llc_balance; /* Tasks should be moved to preferred LLC */ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ unsigned int group_overutilized; /* At least one CPU is overutilized in the group */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; #endif +#ifdef CONFIG_SCHED_CACHE + unsigned int nr_pref_dst_llc; +#endif }; /* @@ -10594,6 +11535,9 @@ group_type group_classify(unsigned int imbalance_pct, if (group_is_overloaded(imbalance_pct, sgs)) return group_overloaded; + if (sgs->group_llc_balance) + return group_llc_balance; + if (sg_imbalanced(group)) return group_imbalanced; @@ -10748,6 +11692,105 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) return check_cpu_capacity(rq, sd); } +#ifdef CONFIG_SCHED_CACHE +/* + * Record the statistics for this scheduler group for later + * use. These values guide load balancing on aggregating tasks + * to a LLC. + */ +static void record_sg_llc_stats(struct lb_env *env, + struct sg_lb_stats *sgs, + struct sched_group *group) +{ + struct sched_domain_shared *sd_share; + int cpu; + + if (!sched_cache_enabled() || env->idle == CPU_NEWLY_IDLE) + return; + + /* Only care about sched domain spanning multiple LLCs */ + if (env->sd->child != rcu_dereference_all(per_cpu(sd_llc, env->dst_cpu))) + return; + + /* + * At this point we know this group spans a LLC domain. + * Record the statistic of this group in its corresponding + * shared LLC domain. + * Note: sd_share cannot be obtained via sd->child->shared, + * because the latter refers to the domain that covers the + * local group. Instead, sd_share should be located using + * the first CPU of the LLC group. + */ + cpu = cpumask_first(sched_group_span(group)); + sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); + if (!sd_share) + return; + + if (READ_ONCE(sd_share->util_avg) != sgs->group_util) + WRITE_ONCE(sd_share->util_avg, sgs->group_util); + + if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity)) + WRITE_ONCE(sd_share->capacity, sgs->group_capacity); +} + +/* + * Do LLC balance on sched group that contains LLC, and have tasks preferring + * to run on LLC in idle dst_cpu. + */ +static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) +{ + if (!sched_cache_enabled()) + return false; + + if (env->sd->flags & SD_SHARE_LLC) + return false; + + /* + * Skip cache aware tagging if nr_balanced_failed is sufficiently high. + * Threshold of cache_nice_tries is set to 1 higher than nr_balance_failed + * to avoid excessive task migration at the same time. + */ + if (env->sd->nr_balance_failed >= env->sd->cache_nice_tries + 1) + return false; + + if (sgs->nr_pref_dst_llc && + can_migrate_llc(cpumask_first(sched_group_span(group)), + env->dst_cpu, 0, true) == mig_llc) + return true; + + return false; +} + +static bool update_llc_busiest(struct lb_env *env, + struct sg_lb_stats *busiest, + struct sg_lb_stats *sgs) +{ + /* + * There are more tasks that want to run on dst_cpu's LLC. + */ + return sgs->nr_pref_dst_llc > busiest->nr_pref_dst_llc; +} +#else +static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) +{ +} + +static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) +{ + return false; +} + +static bool update_llc_busiest(struct lb_env *env, + struct sg_lb_stats *busiest, + struct sg_lb_stats *sgs) +{ + return false; +} +#endif + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. @@ -10784,6 +11827,20 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (cpu_overutilized(i)) sgs->group_overutilized = 1; +#ifdef CONFIG_SCHED_CACHE + if (sched_cache_enabled()) { + struct sched_domain *sd_tmp; + int dst_llc; + + dst_llc = llc_id(env->dst_cpu); + if (llc_id(i) != dst_llc) { + sd_tmp = rcu_dereference_all(rq->sd); + if (sd_tmp && (unsigned int)dst_llc < sd_tmp->llc_max) + sgs->nr_pref_dst_llc += sd_tmp->llc_counts[dst_llc]; + } + } +#endif + /* * No need to call idle_cpu() if nr_running is not 0 */ @@ -10824,17 +11881,24 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; - /* Check if dst CPU is idle and preferred to this group */ - if (!local_group && env->idle && sgs->sum_h_nr_running && - sched_group_asym(env, sgs, group)) - sgs->group_asym_packing = 1; + if (!local_group) { + /* Check if dst CPU is idle and preferred to this group */ + if (env->idle && sgs->sum_h_nr_running && + sched_group_asym(env, sgs, group)) + sgs->group_asym_packing = 1; + + /* Check for loaded SMT group to be balanced to dst CPU */ + if (smt_balance(env, sgs, group)) + sgs->group_smt_balance = 1; - /* Check for loaded SMT group to be balanced to dst CPU */ - if (!local_group && smt_balance(env, sgs, group)) - sgs->group_smt_balance = 1; + /* Check for tasks in this group can be moved to their preferred LLC */ + if (llc_balance(env, sgs, group)) + sgs->group_llc_balance = 1; + } sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + record_sg_llc_stats(env, sgs, group); /* Computing avg_load makes sense only when group is overloaded */ if (sgs->group_type == group_overloaded) sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / @@ -10899,6 +11963,10 @@ static bool update_sd_pick_busiest(struct lb_env *env, /* Select the overloaded group with highest avg_load. */ return sgs->avg_load > busiest->avg_load; + case group_llc_balance: + /* Select the group with most tasks preferring dst LLC */ + return update_llc_busiest(env, busiest, sgs); + case group_imbalanced: /* * Select the 1st imbalanced group as we don't have any way to @@ -11161,6 +12229,7 @@ static bool update_pick_idlest(struct sched_group *idlest, return false; break; + case group_llc_balance: case group_imbalanced: case group_asym_packing: case group_smt_balance: @@ -11293,6 +12362,7 @@ sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int return NULL; break; + case group_llc_balance: case group_imbalanced: case group_asym_packing: case group_smt_balance: @@ -11547,6 +12617,15 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s return; } +#ifdef CONFIG_SCHED_CACHE + if (busiest->group_type == group_llc_balance) { + /* Move a task that prefer local LLC */ + env->migration_type = migrate_llc_task; + env->imbalance = 1; + return; + } +#endif + if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages @@ -11793,7 +12872,8 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env) * group's child domain. */ if (sds.prefer_sibling && local->group_type == group_has_spare && - sibling_imbalance(env, &sds, busiest, local) > 1) + (busiest->group_type == group_llc_balance || + sibling_imbalance(env, &sds, busiest, local) > 1)) goto force_balance; if (busiest->group_type != group_overloaded) { @@ -11852,7 +12932,10 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, { struct rq *busiest = NULL, *rq; unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; + unsigned int __maybe_unused busiest_pref_llc = 0; + struct sched_domain __maybe_unused *sd_tmp; unsigned int busiest_nr = 0; + int __maybe_unused dst_llc; int i; for_each_cpu_and(i, sched_group_span(group), env->cpus) { @@ -11980,6 +13063,23 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, break; + case migrate_llc_task: +#ifdef CONFIG_SCHED_CACHE + sd_tmp = rcu_dereference_all(rq->sd); + dst_llc = llc_id(env->dst_cpu); + + if (sd_tmp && (unsigned)dst_llc < sd_tmp->llc_max) { + unsigned int this_pref_llc = + sd_tmp->llc_counts[dst_llc]; + + if (busiest_pref_llc < this_pref_llc) { + busiest_pref_llc = this_pref_llc; + busiest = rq; + } + } +#endif + break; + } } @@ -12031,6 +13131,9 @@ static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; + if (alb_break_llc(env)) + return 0; + if (asym_active_balance(env)) return 1; @@ -12050,7 +13153,8 @@ static int need_active_balance(struct lb_env *env) return 1; } - if (env->migration_type == migrate_misfit) + if (env->migration_type == migrate_misfit || + env->migration_type == migrate_llc_task) return 1; return 0; @@ -12143,6 +13247,8 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd case migrate_misfit: __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance); break; + case migrate_llc_task: + break; } } @@ -12346,9 +13452,16 @@ more_balance: * * Similarly for migration_misfit which is not related to * load/util migration, don't pollute nr_balance_failed. + * + * The same for cache aware scheduling's allowance for + * load imbalance. If regular load balance does not + * migrate task due to LLC locality, it is a expected + * behavior and don't pollute nr_balance_failed. + * See can_migrate_task(). */ if (idle != CPU_NEWLY_IDLE && - env.migration_type != migrate_misfit) + env.migration_type != migrate_misfit && + !(env.flags & LBF_LLC_PINNED)) sd->nr_balance_failed++; if (need_active_balance(&env)) { @@ -13756,6 +14869,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); + task_tick_cache(rq, curr); + update_misfit_status(curr, rq); check_update_overutilized_status(task_rq(curr)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bfb4b47c021b..8eb8f83db6b0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1187,6 +1187,12 @@ struct rq { struct scx_rq scx; struct sched_dl_entity ext_server; #endif +#ifdef CONFIG_SCHED_CACHE + raw_spinlock_t cpu_epoch_lock ____cacheline_aligned; + u64 cpu_runtime; + unsigned long cpu_epoch; + unsigned long cpu_epoch_next; +#endif struct sched_dl_entity fair_server; @@ -1199,6 +1205,12 @@ struct rq { #ifdef CONFIG_NUMA_BALANCING unsigned int numa_migrate_on; #endif + +#ifdef CONFIG_SCHED_CACHE + unsigned int nr_pref_llc_running; + unsigned int nr_llc_running; +#endif + /* * This is part of a global counter where only the total sum * over all CPUs matters. A task can increase this counter on @@ -1546,6 +1558,14 @@ extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags); extern void sched_core_get(void); extern void sched_core_put(void); +static inline bool task_has_sched_core(struct task_struct *p) +{ + if (sched_core_disabled()) + return false; + + return !!p->core_cookie; +} + #else /* !CONFIG_SCHED_CORE: */ static inline bool sched_core_enabled(struct rq *rq) @@ -1586,6 +1606,11 @@ static inline bool sched_group_cookie_match(struct rq *rq, return true; } +static inline bool task_has_sched_core(struct task_struct *p) +{ + return false; +} + #endif /* !CONFIG_SCHED_CORE */ #ifdef CONFIG_RT_GROUP_SCHED @@ -2076,6 +2101,8 @@ init_numa_balancing(u64 clone_flags, struct task_struct *p) #endif /* !CONFIG_NUMA_BALANCING */ +int task_llc(const struct task_struct *p); + static inline void queue_balance_callback(struct rq *rq, struct balance_callback *head, @@ -2164,6 +2191,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); DECLARE_PER_CPU(int, sd_share_id); +DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); @@ -4031,6 +4059,29 @@ static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { } #endif /* !CONFIG_SCHED_MM_CID */ +#ifdef CONFIG_SCHED_CACHE +DECLARE_STATIC_KEY_FALSE(sched_cache_present); +DECLARE_STATIC_KEY_FALSE(sched_cache_active); +extern int sysctl_sched_cache_user; +extern unsigned int llc_aggr_tolerance; +extern unsigned int llc_epoch_period; +extern unsigned int llc_epoch_affinity_timeout; +extern unsigned int llc_imb_pct; +extern unsigned int llc_overaggr_pct; + +static inline bool sched_cache_enabled(void) +{ + return static_branch_unlikely(&sched_cache_active); +} + +extern void sched_cache_active_set(void); + +#endif + +void sched_domains_free_llc_id(int cpu); + +extern void init_sched_mm(struct task_struct *p); + extern u64 avg_vruntime(struct cfs_rq *cfs_rq); extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); static inline diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f96d50131495..dbfd9657f897 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -19,8 +19,10 @@ void sched_domains_mutex_unlock(void) } /* Protected by sched_domains_mutex: */ +static cpumask_var_t sched_domains_llc_id_allocmask; static cpumask_var_t sched_domains_tmpmask; static cpumask_var_t sched_domains_tmpmask2; +int max_lid; static int __init sched_debug_setup(char *str) { @@ -632,6 +634,11 @@ static void destroy_sched_domain(struct sched_domain *sd) if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) kfree(sd->shared); + +#ifdef CONFIG_SCHED_CACHE + /* only the bottom sd has llc_counts array */ + kfree(sd->llc_counts); +#endif kfree(sd); } @@ -663,8 +670,9 @@ static void destroy_sched_domains(struct sched_domain *sd) */ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); -DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(int, sd_llc_id) = -1; DEFINE_PER_CPU(int, sd_share_id); +DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); @@ -680,38 +688,19 @@ static void update_top_cache_domain(int cpu) int id = cpu; int size = 1; - sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL); - /* - * The shared object is attached to sd_asym_cpucapacity only when the - * asym domain is non-overlapping (i.e., not built from SD_NUMA). - * On overlapping (NUMA) asym domains we fall back to letting the - * SD_SHARE_LLC path own the shared object, so sd->shared may be NULL - * here. - */ - if (sd && sd->shared) - sds = sd->shared; - - rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd); - sd = highest_flag_domain(cpu, SD_SHARE_LLC); if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); - /* - * If sd_asym_cpucapacity didn't claim the shared object, - * sd_llc must have one linked. - */ - if (!sds) { - WARN_ON_ONCE(!sd->shared); - sds = sd->shared; - } + /* If sd_llc exists, sd_llc_shared should exist too. */ + WARN_ON_ONCE(!sd->shared); + sds = sd->shared; } rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; - per_cpu(sd_llc_id, cpu) = id; - rcu_assign_pointer(per_cpu(sd_balance_shared, cpu), sds); + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); sd = lowest_flag_domain(cpu, SD_CLUSTER); if (sd) @@ -729,6 +718,20 @@ static void update_top_cache_domain(int cpu) sd = highest_flag_domain(cpu, SD_ASYM_PACKING); rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd); + + sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL); + /* + * The shared object is attached to sd_asym_cpucapacity only when the + * asym domain is non-overlapping (i.e., not built from SD_NUMA). + * On overlapping (NUMA) asym domains we fall back to letting the + * SD_SHARE_LLC path own the shared object, so sd->shared may be NULL + * here. + */ + if (sd && sd->shared) + sds = sd->shared; + + rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd); + rcu_assign_pointer(per_cpu(sd_balance_shared, cpu), sds); } /* @@ -777,10 +780,20 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) if (sd && sd_degenerate(sd)) { tmp = sd; sd = sd->parent; - destroy_sched_domain(tmp); + if (sd) { struct sched_group *sg = sd->groups; +#ifdef CONFIG_SCHED_CACHE + /* move buffer to parent as child is being destroyed */ + sd->llc_counts = tmp->llc_counts; + sd->llc_max = tmp->llc_max; + sd->llc_bytes = tmp->llc_bytes; + /* make sure destroy_sched_domain() does not free it */ + tmp->llc_counts = NULL; + tmp->llc_max = 0; + tmp->llc_bytes = 0; +#endif /* * sched groups hold the flags of the child sched * domain for convenience. Clear such flags since @@ -792,6 +805,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) sd->child = NULL; } + + destroy_sched_domain(tmp); } sched_domain_debug(sd, cpu); @@ -819,6 +834,239 @@ enum s_alloc { sa_none, }; +#ifdef CONFIG_SCHED_CACHE +/* hardware support for cache aware scheduling */ +DEFINE_STATIC_KEY_FALSE(sched_cache_present); +/* + * Indicator of whether cache aware scheduling + * is active, used by the scheduler. + */ +DEFINE_STATIC_KEY_FALSE(sched_cache_active); +/* user wants cache aware scheduling [0 or 1] */ +int sysctl_sched_cache_user = 1; + +/* + * Get the effective LLC size in bytes that @cpu's bottom sched_domain + * can use. A CPU within a cpuset partition can only use a proportion + * of the physical LLC, scaled by the ratio of the partition's span + * weight to the hardware LLC sharing weight. @sd should be the + * topmost domain with SD_SHARE_LLC. + * + * Returns 0 if cacheinfo is not yet populated. This happens during + * early boot when build_sched_domains() runs before the generic + * cacheinfo framework has been initialized (cacheinfo_cpu_online() + * is a device_initcall cpuhp callback). In that case, + * cacheinfo_cpu_online() will later call sched_update_llc_bytes() + * to fill in the bottom domain's llc_bytes once the cache attributes + * are available. + */ +static unsigned long get_effective_llc_bytes(int cpu, + struct sched_domain *sd) +{ + struct cacheinfo *ci; + unsigned int hw_weight; + + ci = get_cpu_cacheinfo_llc(cpu); + if (!ci) + return 0; + + hw_weight = cpumask_weight(&ci->shared_cpu_map); + if (!hw_weight) + return 0; + + return div_u64((u64)ci->size * sd->span_weight, hw_weight); +} + +static bool alloc_sd_llc(const struct cpumask *cpu_map, + struct s_data *d) +{ + struct sched_domain *sd, *top_llc, *parent; + unsigned int *p; + int i; + + for_each_cpu(i, cpu_map) { + sd = *per_cpu_ptr(d->sd, i); + if (!sd) + goto err; + + p = kcalloc_node(max_lid + 1, sizeof(unsigned int), + GFP_KERNEL, cpu_to_node(i)); + if (!p) + goto err; + + top_llc = sd; + /* + * Find the topmost SD_SHARE_LLC domain. + * Not yet attached to the CPU, so per_cpu(sd_llc, i) + * can not be used. + */ + while ((parent = rcu_dereference_protected(top_llc->parent, true)) && + (parent->flags & SD_SHARE_LLC)) + top_llc = parent; + + if (top_llc->flags & SD_SHARE_LLC) { + sd->llc_max = max_lid + 1; + sd->llc_counts = p; + sd->llc_bytes = get_effective_llc_bytes(i, top_llc); + } else { + /* avoid memory leak */ + kfree(p); + } + } + + return true; +err: + for_each_cpu(i, cpu_map) { + sd = *per_cpu_ptr(d->sd, i); + if (sd) { + kfree(sd->llc_counts); + sd->llc_counts = NULL; + sd->llc_max = 0; + sd->llc_bytes = 0; + } + } + + return false; +} + +/* + * Enable/disable cache aware scheduling according to + * user input and the presence of hardware support. + */ +static void _sched_cache_active_set(void) +{ + lockdep_assert_cpus_held(); + lockdep_assert_held(&sched_domains_mutex); + + /* hardware does not support */ + if (!static_branch_likely(&sched_cache_present)) { + static_branch_disable_cpuslocked(&sched_cache_active); + if (sched_debug()) + pr_info("%s: cache aware scheduling not supported on this platform\n", __func__); + return; + } + + /* + * user wants it or not ? + * TBD: read before writing the static key. + * It is not in the critical path, leave as-is + * for now. + */ + if (sysctl_sched_cache_user) { + static_branch_enable_cpuslocked(&sched_cache_active); + if (sched_debug()) + pr_info("%s: enabling cache aware scheduling\n", __func__); + } else { + static_branch_disable_cpuslocked(&sched_cache_active); + if (sched_debug()) + pr_info("%s: disabling cache aware scheduling\n", __func__); + } +} + +/* used by debugfs */ +void sched_cache_active_set(void) +{ + cpus_read_lock(); + sched_domains_mutex_lock(); + _sched_cache_active_set(); + sched_domains_mutex_unlock(); + cpus_read_unlock(); +} + +/* + * Update the bottom sched_domain's llc_bytes for @cpu and all its + * LLC siblings. Called from cacheinfo_cpu_online() or + * cacheinfo_cpu_pre_down() with cpu hotplug lock held. + * + * Note: get_effective_llc_bytes() returns 0 on PowerPC. + * thus cache aware scheduling is disabled on PowerPC for + * now. PowerPC does not use the generic cacheinfo framework -- + * it has its own cacheinfo with a separate struct cache hierarchy + * and does not populates the per-CPU struct cpu_cacheinfo array + * that get_cpu_cacheinfo_llc() reads. + */ +void sched_update_llc_bytes(unsigned int cpu) +{ + struct sched_domain *sd, *sdp; + unsigned int i; + + sched_domains_mutex_lock(); + + sdp = rcu_dereference_sched_domain(per_cpu(sd_llc, cpu)); + if (!sdp) + goto unlock; + + /* + * ci->shared_cpu_map is built incrementally as CPUs come + * online, so the first CPU in an LLC initially sees + * hw_weight == 1 and computes an inflated llc_bytes in + * get_effective_llc_bytes(). Re-evaluating every LLC + * sibling on each online event corrects this once the full + * shared_cpu_map is known. + */ + for_each_cpu(i, sched_domain_span(sdp)) { + sd = rcu_dereference_sched_domain(cpu_rq(i)->sd); + if (sd) + sd->llc_bytes = get_effective_llc_bytes(i, sdp); + } + +unlock: + sched_domains_mutex_unlock(); +} + +static void sched_cache_set(bool has_multi_llcs) +{ + /* + * TBD: check before writing to it. sched domain rebuild + * is not in the critical path, leave as-is for now. + */ + if (has_multi_llcs) + static_branch_enable_cpuslocked(&sched_cache_present); + else + static_branch_disable_cpuslocked(&sched_cache_present); + + _sched_cache_active_set(); +} +#else +static bool alloc_sd_llc(const struct cpumask *cpu_map, + struct s_data *d) +{ + return false; +} +static inline void sched_cache_set(bool has_multi_llcs) { } +#endif + +/* + * Return true if @sd belongs to an LLC group whose enclosing + * partition spans more than one LLC. @sd must be the topmost + * SD_SHARE_LLC domain. + * + * Any duplicated parent domains with the same span as @sd are + * skipped: before cpu_attach_domain() degeneration these still + * exist, after degeneration the loop is a no-op. This makes the + * helper usable both during sched domain build and against an + * already-attached domain tree. + * + * Note: For systems with a single LLC per node, cache-aware + * scheduling is still enabled when multiple nodes exist. + * However, NUMA balancing decisions take precedence over + * cache-aware scheduling. Conversely, if there is only one + * LLC per partition, cache-aware scheduling should be disabled. + */ +static bool sd_in_multi_llcs(struct sched_domain *sd) +{ + struct sched_domain *sdp = sd->parent; + + /* it does not make sense to aggregate to 1 CPU */ + if (sd->span_weight == 1) + return false; + + while (sdp && sdp->span_weight == sd->span_weight) + sdp = sdp->parent; + + return !!sdp; +} + /* * Return the canonical balance CPU for this group, this is the first CPU * of this group that's also in the balance mask. @@ -1803,6 +2051,11 @@ const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu { return cpu_coregroup_mask(cpu); } + +#define llc_mask(cpu) cpu_coregroup_mask(cpu) + +#else +#define llc_mask(cpu) cpumask_of(cpu) #endif const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu) @@ -2711,14 +2964,71 @@ static bool claim_asym_sched_domain_shared(struct s_data *d, int cpu) return true; } +static int __sched_domains_alloc_llc_id(void) +{ + int lid, max; + + lockdep_assert_held(&sched_domains_mutex); + + lid = cpumask_first_zero(sched_domains_llc_id_allocmask); + /* + * llc_id space should never grow larger than the + * possible number of CPUs in the system. + */ + if (lid >= nr_cpu_ids) + return -1; + + __cpumask_set_cpu(lid, sched_domains_llc_id_allocmask); + max = cpumask_last(sched_domains_llc_id_allocmask); + if (max > max_lid) + max_lid = max; + + return lid; +} + +static void __sched_domains_free_llc_id(int cpu) +{ + int i, lid, max; + + lockdep_assert_held(&sched_domains_mutex); + + lid = per_cpu(sd_llc_id, cpu); + if (lid == -1 || lid >= nr_cpu_ids) + return; + + per_cpu(sd_llc_id, cpu) = -1; + + for_each_cpu(i, llc_mask(cpu)) { + /* An online CPU owns the llc_id. */ + if (per_cpu(sd_llc_id, i) == lid) + return; + } + + __cpumask_clear_cpu(lid, sched_domains_llc_id_allocmask); + + max = cpumask_last(sched_domains_llc_id_allocmask); + /* shrink max lid to save memory */ + if (max < max_lid) + max_lid = max; +} + +void sched_domains_free_llc_id(int cpu) +{ + sched_domains_mutex_lock(); + __sched_domains_free_llc_id(cpu); + sched_domains_mutex_unlock(); +} + /* * Build sched domains for a given set of CPUs and attach the sched domains * to the individual CPUs */ static int -build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) +build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr, + bool *multi_llcs) { enum s_alloc alloc_state = sa_none; + bool has_multi_llcs = false; struct sched_domain *sd; struct s_data d; struct rq *rq = NULL; @@ -2736,6 +3046,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att /* Set up domains for CPUs specified by the cpu_map: */ for_each_cpu(i, cpu_map) { struct sched_domain_topology_level *tl; + int lid; sd = NULL; for_each_sd_topology(tl) { @@ -2749,6 +3060,29 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (cpumask_equal(cpu_map, sched_domain_span(sd))) break; } + + lid = per_cpu(sd_llc_id, i); + if (lid == -1) { + /* try to reuse the llc_id of its siblings */ + for (int j = cpumask_first(llc_mask(i)); + j < nr_cpu_ids; + j = cpumask_next(j, llc_mask(i))) { + if (i == j) + continue; + + lid = per_cpu(sd_llc_id, j); + + if (lid != -1) { + per_cpu(sd_llc_id, i) = lid; + + break; + } + } + + /* a new LLC is detected */ + if (lid == -1) + per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id(); + } } if (WARN_ON(!topology_span_sane(cpu_map))) @@ -2769,33 +3103,31 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } for_each_cpu(i, cpu_map) { - bool asym_claimed = false; - sd = *per_cpu_ptr(d.sd, i); if (!sd) continue; if (has_asym) - asym_claimed = claim_asym_sched_domain_shared(&d, i); + claim_asym_sched_domain_shared(&d, i); /* First, find the topmost SD_SHARE_LLC domain */ while (sd->parent && (sd->parent->flags & SD_SHARE_LLC)) sd = sd->parent; if (sd->flags & SD_SHARE_LLC) { - /* - * Initialize the sd->shared for SD_SHARE_LLC unless - * the asym path above already claimed it. - */ - if (!asym_claimed) - init_sched_domain_shared(&d, sd); + init_sched_domain_shared(&d, sd); /* * In presence of higher domains, adjust the * NUMA imbalance stats for the hierarchy. */ - if (IS_ENABLED(CONFIG_NUMA) && sd->parent) - adjust_numa_imbalance(sd); + if (sd->parent) { + if (IS_ENABLED(CONFIG_NUMA)) + adjust_numa_imbalance(sd); + + if (sd_in_multi_llcs(sd)) + has_multi_llcs = true; + } } } @@ -2810,6 +3142,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att init_sched_groups_capacity(i, sd); } + alloc_sd_llc(cpu_map, &d); + /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { @@ -2834,6 +3168,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att ret = 0; error: + *multi_llcs = has_multi_llcs; __free_domain_allocs(&d, alloc_state, cpu_map); return ret; @@ -2896,8 +3231,10 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) */ int __init sched_init_domains(const struct cpumask *cpu_map) { + bool multi_llcs; int err; + zalloc_cpumask_var(&sched_domains_llc_id_allocmask, GFP_KERNEL); zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL); zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL); zalloc_cpumask_var(&fallback_doms, GFP_KERNEL); @@ -2909,7 +3246,9 @@ int __init sched_init_domains(const struct cpumask *cpu_map) if (!doms_cur) doms_cur = &fallback_doms; cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_TYPE_DOMAIN)); - err = build_sched_domains(doms_cur[0], NULL); + err = build_sched_domains(doms_cur[0], NULL, &multi_llcs); + if (!err) + sched_cache_set(multi_llcs); return err; } @@ -2982,6 +3321,7 @@ static void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new struct sched_domain_attr *dattr_new) { bool __maybe_unused has_eas = false; + bool has_multi_llcs = false, multi_llcs; int i, j, n; int new_topology; @@ -3031,14 +3371,41 @@ match1: for (i = 0; i < ndoms_new; i++) { for (j = 0; j < n && !new_topology; j++) { if (cpumask_equal(doms_new[i], doms_cur[j]) && - dattrs_equal(dattr_new, i, dattr_cur, j)) + dattrs_equal(dattr_new, i, dattr_cur, j)) { + /* + * Reused partition has to be taken care + * of here, because there could be a corner + * case that if the reused partition is skipped + * and only new partition is considered, an + * incorrect has_multi_llcs would be set. For + * example: + * If the only multi-LLC partition is reused + * and a new single-LLC partition is built, + * sched_cache_set(false) disables cache-aware + * scheduling globally despite the reused + * multi-LLC partition still being active. + */ + struct sched_domain *sd; + int cpu = cpumask_first(doms_cur[j]); + + guard(rcu)(); + sd = rcu_dereference(cpu_rq(cpu)->sd); + while (sd && sd->parent && (sd->parent->flags & SD_SHARE_LLC)) + sd = sd->parent; + if (sd && (sd->flags & SD_SHARE_LLC) && sd->parent && + sd_in_multi_llcs(sd)) + has_multi_llcs = true; goto match2; + } } /* No match - add a new doms_new */ - build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); + build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL, + &multi_llcs); + has_multi_llcs |= multi_llcs; match2: ; } + sched_cache_set(has_multi_llcs); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) /* Build perf domains: */ |
