From f025ef275388742643a2c33f00a0d9c0af3112ee Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 1 Apr 2026 14:52:15 -0700 Subject: sched/cache: Record per LLC utilization to guide cache aware scheduling decisions When a system becomes busy and a process's preferred LLC is saturated with too many threads, tasks within that LLC migrate frequently. These in LLC migrations introduce latency and degrade performance. To avoid this, task aggregation should be suppressed when the preferred LLC is overloaded, which requires a metric to indicate LLC utilization. Record per LLC utilization/cpu capacity during periodic load balancing. These statistics will be used in later patches to decide whether tasks should be aggregated into their preferred LLC. Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/a48151b3d57f2a42a5971aaead1b7f81e69229f4.1775065312.git.tim.c.chen@linux.intel.com --- include/linux/sched/topology.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/sched') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 36553e14866d..159716fa0d3a 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -68,6 +68,10 @@ struct sched_domain_shared { atomic_t nr_busy_cpus; int has_idle_cores; int nr_idle_scan; +#ifdef CONFIG_SCHED_CACHE + unsigned long util_avg; + unsigned long capacity; +#endif }; struct sched_domain { -- cgit v1.2.3 From a8d0ca0b7f2f7b53565d1e30e509d3d74d1f5460 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 1 Apr 2026 14:52:20 -0700 Subject: sched/cache: Introduce per CPU's tasks LLC preference counter The lowest level of sched domain for each CPU is assigned an array where each element tracks the number of tasks preferring a given LLC, indexed from 0 to max_lid. Since each CPU has its dedicated sd, this implies that each CPU will have a dedicated task LLC preference counter. For example, sd->llc_counts[3] = 2 signifies that there are 2 tasks on this runqueue which prefer to run within LLC3. The load balancer can use this information to identify busy runqueues and migrate tasks to their preferred LLC domains. This array will be reallocated at runtime during sched domain rebuild. Introduce the buffer allocation mechanism, and the statistics will be calculated in the subsequent patch. Note: the LLC preference statistics of each CPU are reset on sched domain rebuild and may under count temporarily, until the CPU becomes idle and the count is cleared. This is a trade off to avoid complex data synchronization across sched domain builds. Suggested-by: Peter Zijlstra (Intel) Suggested-by: K Prateek Nayak Co-developed-by: Chen Yu Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/42e79eceb8cd6be8a032401d481d101913bc5703.1775065312.git.tim.c.chen@linux.intel.com --- include/linux/sched/topology.h | 5 ++++ kernel/sched/topology.c | 62 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 66 insertions(+), 1 deletion(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 159716fa0d3a..0036d6b4bd67 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -103,6 +103,11 @@ struct sched_domain { u64 max_newidle_lb_cost; unsigned long last_decay_max_lb_cost; +#ifdef CONFIG_SCHED_CACHE + unsigned int llc_max; + unsigned int *llc_counts __counted_by_ptr(llc_max); +#endif + #ifdef CONFIG_SCHEDSTATS /* sched_balance_rq() stats */ unsigned int lb_count[CPU_MAX_IDLE_TYPES]; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 1200670969bb..8954bf7900ff 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -634,6 +634,11 @@ static void destroy_sched_domain(struct sched_domain *sd) if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) kfree(sd->shared); + +#ifdef CONFIG_SCHED_CACHE + /* only the bottom sd has llc_counts array */ + kfree(sd->llc_counts); +#endif kfree(sd); } @@ -763,10 +768,18 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) if (sd && sd_degenerate(sd)) { tmp = sd; sd = sd->parent; - destroy_sched_domain(tmp); + if (sd) { struct sched_group *sg = sd->groups; +#ifdef CONFIG_SCHED_CACHE + /* move buffer to parent as child is being destroyed */ + sd->llc_counts = tmp->llc_counts; + sd->llc_max = tmp->llc_max; + /* make sure destroy_sched_domain() does not free it */ + tmp->llc_counts = NULL; + tmp->llc_max = 0; +#endif /* * sched groups hold the flags of the child sched * domain for convenience. Clear such flags since @@ -778,6 +791,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) sd->child = NULL; } + + destroy_sched_domain(tmp); } sched_domain_debug(sd, cpu); @@ -805,6 +820,49 @@ enum s_alloc { sa_none, }; +#ifdef CONFIG_SCHED_CACHE +static bool alloc_sd_llc(const struct cpumask *cpu_map, + struct s_data *d) +{ + struct sched_domain *sd; + unsigned int *p; + int i; + + for_each_cpu(i, cpu_map) { + sd = *per_cpu_ptr(d->sd, i); + if (!sd) + goto err; + + p = kcalloc_node(max_lid + 1, sizeof(unsigned int), + GFP_KERNEL, cpu_to_node(i)); + if (!p) + goto err; + + sd->llc_max = max_lid + 1; + sd->llc_counts = p; + } + + return true; +err: + for_each_cpu(i, cpu_map) { + sd = *per_cpu_ptr(d->sd, i); + if (sd) { + kfree(sd->llc_counts); + sd->llc_counts = NULL; + sd->llc_max = 0; + } + } + + return false; +} +#else +static bool alloc_sd_llc(const struct cpumask *cpu_map, + struct s_data *d) +{ + return false; +} +#endif + /* * Return the canonical balance CPU for this group, this is the first CPU * of this group that's also in the balance mask. @@ -2828,6 +2886,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att init_sched_groups_capacity(i, sd); } + alloc_sd_llc(cpu_map, &d); + /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { -- cgit v1.2.3 From 7030513a08776b2ca70fccd5dfddf7bb5c5c88ba Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:15 -0700 Subject: sched/cache: Calculate the LLC size and store it in sched_domain Cache aware scheduling needs to know the LLC size that a process can use, so as to avoid memory-intensive tasks from being over-aggregated on a single LLC. Introduce a preparation patch to add get_effective_llc_bytes() to get the LLC size that a CPU can use. The function can be further enhanced by subtracting the LLC cache ways reserved by resctrl (CAT in Intel RDT, etc). Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tingyin Duan Link: https://patch.msgid.link/37afee09ff608034da0ce149e72d33b6f4698edf.1778703694.git.tim.c.chen@linux.intel.com --- drivers/base/cacheinfo.c | 23 ++++++++++ include/linux/cacheinfo.h | 1 + include/linux/sched/topology.h | 7 +++ kernel/sched/topology.c | 98 ++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 126 insertions(+), 3 deletions(-) (limited to 'include/linux/sched') diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c index 391ac5e3d2f5..70701d3bc81c 100644 --- a/drivers/base/cacheinfo.c +++ b/drivers/base/cacheinfo.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -68,6 +69,24 @@ bool last_level_cache_is_valid(unsigned int cpu) } +/* + * Get the cacheinfo of the LLC associated with @cpu. + * Derived from update_per_cpu_data_slice_size_cpu(). + */ +struct cacheinfo *get_cpu_cacheinfo_llc(unsigned int cpu) +{ + struct cacheinfo *llc; + + if (!last_level_cache_is_valid(cpu)) + return NULL; + + llc = per_cpu_cacheinfo_idx(cpu, cache_leaves(cpu) - 1); + if (llc->type != CACHE_TYPE_DATA && llc->type != CACHE_TYPE_UNIFIED) + return NULL; + + return llc; +} + bool last_level_cache_is_shared(unsigned int cpu_x, unsigned int cpu_y) { struct cacheinfo *llc_x, *llc_y; @@ -1018,6 +1037,7 @@ static int cacheinfo_cpu_online(unsigned int cpu) goto err; if (cpu_map_shared_cache(true, cpu, &cpu_map)) update_per_cpu_data_slice_size(true, cpu, cpu_map); + sched_update_llc_bytes(cpu); return 0; err: free_cache_attributes(cpu); @@ -1036,6 +1056,9 @@ static int cacheinfo_cpu_pre_down(unsigned int cpu) free_cache_attributes(cpu); if (nr_shared > 1) update_per_cpu_data_slice_size(false, cpu, cpu_map); + + sched_update_llc_bytes(cpu); + return 0; } diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index c8f4f0a0b874..fc879ac4cc4f 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -89,6 +89,7 @@ int populate_cache_leaves(unsigned int cpu); int cache_setup_acpi(unsigned int cpu); bool last_level_cache_is_valid(unsigned int cpu); bool last_level_cache_is_shared(unsigned int cpu_x, unsigned int cpu_y); +struct cacheinfo *get_cpu_cacheinfo_llc(unsigned int cpu); int fetch_cache_info(unsigned int cpu); int detect_cache_attributes(unsigned int cpu); #ifndef CONFIG_ACPI_PPTT diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 0036d6b4bd67..fe09d3268bc9 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -106,6 +106,7 @@ struct sched_domain { #ifdef CONFIG_SCHED_CACHE unsigned int llc_max; unsigned int *llc_counts __counted_by_ptr(llc_max); + unsigned long llc_bytes; #endif #ifdef CONFIG_SCHEDSTATS @@ -265,4 +266,10 @@ static inline int task_node(const struct task_struct *p) return cpu_to_node(task_cpu(p)); } +#ifdef CONFIG_SCHED_CACHE +extern void sched_update_llc_bytes(unsigned int cpu); +#else +static inline void sched_update_llc_bytes(unsigned int cpu) { } +#endif + #endif /* _LINUX_SCHED_TOPOLOGY_H */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 9fc99346ef4f..7248a7279abe 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -776,9 +776,11 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) /* move buffer to parent as child is being destroyed */ sd->llc_counts = tmp->llc_counts; sd->llc_max = tmp->llc_max; + sd->llc_bytes = tmp->llc_bytes; /* make sure destroy_sched_domain() does not free it */ tmp->llc_counts = NULL; tmp->llc_max = 0; + tmp->llc_bytes = 0; #endif /* * sched groups hold the flags of the child sched @@ -831,10 +833,42 @@ DEFINE_STATIC_KEY_FALSE(sched_cache_active); /* user wants cache aware scheduling [0 or 1] */ int sysctl_sched_cache_user = 1; +/* + * Get the effective LLC size in bytes that @cpu's bottom sched_domain + * can use. A CPU within a cpuset partition can only use a proportion + * of the physical LLC, scaled by the ratio of the partition's span + * weight to the hardware LLC sharing weight. @sd should be the + * topmost domain with SD_SHARE_LLC. + * + * Returns 0 if cacheinfo is not yet populated. This happens during + * early boot when build_sched_domains() runs before the generic + * cacheinfo framework has been initialized (cacheinfo_cpu_online() + * is a device_initcall cpuhp callback). In that case, + * cacheinfo_cpu_online() will later call sched_update_llc_bytes() + * to fill in the bottom domain's llc_bytes once the cache attributes + * are available. + */ +static unsigned long get_effective_llc_bytes(int cpu, + struct sched_domain *sd) +{ + struct cacheinfo *ci; + unsigned int hw_weight; + + ci = get_cpu_cacheinfo_llc(cpu); + if (!ci) + return 0; + + hw_weight = cpumask_weight(&ci->shared_cpu_map); + if (!hw_weight) + return 0; + + return div_u64((u64)ci->size * sd->span_weight, hw_weight); +} + static bool alloc_sd_llc(const struct cpumask *cpu_map, struct s_data *d) { - struct sched_domain *sd; + struct sched_domain *sd, *top_llc, *parent; unsigned int *p; int i; @@ -848,8 +882,24 @@ static bool alloc_sd_llc(const struct cpumask *cpu_map, if (!p) goto err; - sd->llc_max = max_lid + 1; - sd->llc_counts = p; + top_llc = sd; + /* + * Find the topmost SD_SHARE_LLC domain. + * Not yet attached to the CPU, so per_cpu(sd_llc, i) + * can not be used. + */ + while ((parent = rcu_dereference_protected(top_llc->parent, true)) && + (parent->flags & SD_SHARE_LLC)) + top_llc = parent; + + if (top_llc->flags & SD_SHARE_LLC) { + sd->llc_max = max_lid + 1; + sd->llc_counts = p; + sd->llc_bytes = get_effective_llc_bytes(i, top_llc); + } else { + /* avoid memory leak */ + kfree(p); + } } return true; @@ -860,6 +910,7 @@ err: kfree(sd->llc_counts); sd->llc_counts = NULL; sd->llc_max = 0; + sd->llc_bytes = 0; } } @@ -919,6 +970,47 @@ void sched_cache_active_set_unlocked(void) { return sched_cache_active_set(false); } + +/* + * Update the bottom sched_domain's llc_bytes for @cpu and all its + * LLC siblings. Called from cacheinfo_cpu_online() or + * cacheinfo_cpu_pre_down() with cpu hotplug lock held. + * + * Note: get_effective_llc_bytes() returns 0 on PowerPC. + * thus cache aware scheduling is disabled on PowerPC for + * now. PowerPC does not use the generic cacheinfo framework -- + * it has its own cacheinfo with a separate struct cache hierarchy + * and does not populates the per-CPU struct cpu_cacheinfo array + * that get_cpu_cacheinfo_llc() reads. + */ +void sched_update_llc_bytes(unsigned int cpu) +{ + struct sched_domain *sd, *sdp; + unsigned int i; + + sched_domains_mutex_lock(); + + sdp = rcu_dereference_sched_domain(per_cpu(sd_llc, cpu)); + if (!sdp) + goto unlock; + + /* + * ci->shared_cpu_map is built incrementally as CPUs come + * online, so the first CPU in an LLC initially sees + * hw_weight == 1 and computes an inflated llc_bytes in + * get_effective_llc_bytes(). Re-evaluating every LLC + * sibling on each online event corrects this once the full + * shared_cpu_map is known. + */ + for_each_cpu(i, sched_domain_span(sdp)) { + sd = rcu_dereference_sched_domain(cpu_rq(i)->sd); + if (sd) + sd->llc_bytes = get_effective_llc_bytes(i, sdp); + } + +unlock: + sched_domains_mutex_unlock(); +} #else static bool alloc_sd_llc(const struct cpumask *cpu_map, struct s_data *d) -- cgit v1.2.3 From ea19506013ad13685573e4674fbeddb790e27906 Mon Sep 17 00:00:00 2001 From: Yiyang Chen Date: Fri, 15 May 2026 00:05:05 +0800 Subject: sched/clock: Provide !HAVE_UNSTABLE_SCHED_CLOCK stub for sched_clock_stable() When CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is disabled, sched_clock() is already assumed to provide stable semantics, but the public header doesn't provide a sched_clock_stable() stub for that case. Add a header stub that always returns true and clean up the duplicate local stub in ring_buffer.c, so callers can use sched_clock_stable() unconditionally. Signed-off-by: Yiyang Chen Signed-off-by: Peter Zijlstra (Intel) Acked-by: Steven Rostedt Link: https://patch.msgid.link/56e45338858946cd9581b75c8bd45dd37dba52c5.1778773587.git.cyyzero16@gmail.com --- include/linux/sched/clock.h | 5 +++++ kernel/trace/ring_buffer.c | 7 ------- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h index 196f0ca351a2..39f0a7f94bfc 100644 --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -33,6 +33,11 @@ extern u64 sched_clock_cpu(int cpu); extern void sched_clock_init(void); #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline int sched_clock_stable(void) +{ + return 1; +} + static inline void sched_clock_tick(void) { } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5326924615a4..02691c3c6dd6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3769,13 +3769,6 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, return skip_time_extend(event); } -#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -static inline bool sched_clock_stable(void) -{ - return true; -} -#endif - static void rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) -- cgit v1.2.3 From 5bc6ab2d42e545f816def21cfcdb4ba35cc74bf6 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Fri, 15 May 2026 22:54:54 +0530 Subject: sched: Simplify ifdeffery around cpu_smt_mask Now, that cpu_smt_mask is defined as cpumask_of(cpu) for CONFIG_SCHED_SMT=n, it is possible to get rid of the ifdeffery. Effectively, - This makes sched_smt_present is defined always - cpumask_weight(cpumask_of(cpu)) == 1. So sched_smt_present_inc/dec will never enable the sched_smt_present. Which is expected. - Paths that were compile-time eliminated become runtime guarded using static keys. - Defines set_idle_cores, test_idle_cores, etc which could likely benefit the CONFIG_SCHED_SMT=n systems to use the same optimizations within the LLC at wakeups. - This will expose sched_smt_present symbol for CONFIG_SCHED_SMT=n. Likely not a concern. - There is a bloat of code CONFIG_SCHED_SMT=n. (NR_CPUS=2048) add/remove: 24/18 grow/shrink: 26/28 up/down: 6396/-3188 (3208) Total: Before=30629880, After=30633088, chg +0.01% - No code bloat for CONFIG_SCHED_SMT=y, which is expected. - Add comments around stop_core_cpuslocked on why ifdefs are not removed. - This leaves the remaining uses of CONFIG_SCHED_SMT mainly for topology building bits which has a policy based decision. Signed-off-by: Shrikanth Hegde Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Valentin Schneider Acked-by: Tejun Heo Tested-by: K Prateek Nayak Link: https://patch.msgid.link/20260515172456.542799-3-sshegde@linux.ibm.com --- include/linux/sched/smt.h | 4 ---- kernel/sched/core.c | 6 ------ kernel/sched/ext_idle.c | 6 ------ kernel/sched/fair.c | 35 ----------------------------------- kernel/sched/sched.h | 6 ------ kernel/sched/topology.c | 2 -- kernel/stop_machine.c | 5 +++++ kernel/workqueue.c | 4 ---- 8 files changed, 5 insertions(+), 63 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h index 166b19af956f..cde6679c0278 100644 --- a/include/linux/sched/smt.h +++ b/include/linux/sched/smt.h @@ -4,16 +4,12 @@ #include -#ifdef CONFIG_SCHED_SMT extern struct static_key_false sched_smt_present; static __always_inline bool sched_smt_active(void) { return static_branch_likely(&sched_smt_present); } -#else -static __always_inline bool sched_smt_active(void) { return false; } -#endif void arch_smt_update(void); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b905805bbcbe..3ae5f19c1b7e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8612,18 +8612,14 @@ static void cpuset_cpu_inactive(unsigned int cpu) static inline void sched_smt_present_inc(int cpu) { -#ifdef CONFIG_SCHED_SMT if (cpumask_weight(cpu_smt_mask(cpu)) == 2) static_branch_inc_cpuslocked(&sched_smt_present); -#endif } static inline void sched_smt_present_dec(int cpu) { -#ifdef CONFIG_SCHED_SMT if (cpumask_weight(cpu_smt_mask(cpu)) == 2) static_branch_dec_cpuslocked(&sched_smt_present); -#endif } int sched_cpu_activate(unsigned int cpu) @@ -8711,9 +8707,7 @@ int sched_cpu_deactivate(unsigned int cpu) */ sched_smt_present_dec(cpu); -#ifdef CONFIG_SCHED_SMT sched_core_cpu_deactivate(cpu); -#endif if (!sched_smp_initialized) return 0; diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 7468560a6d80..2bcf58e99c9b 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -79,7 +79,6 @@ static bool scx_idle_test_and_clear_cpu(int cpu) int node = scx_cpu_node_if_enabled(cpu); struct cpumask *idle_cpus = idle_cpumask(node)->cpu; -#ifdef CONFIG_SCHED_SMT /* * SMT mask should be cleared whether we can claim @cpu or not. The SMT * cluster is not wholly idle either way. This also prevents @@ -104,7 +103,6 @@ static bool scx_idle_test_and_clear_cpu(int cpu) else if (cpumask_test_cpu(cpu, idle_smts)) __cpumask_clear_cpu(cpu, idle_smts); } -#endif return cpumask_test_and_clear_cpu(cpu, idle_cpus); } @@ -622,7 +620,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, goto out_unlock; } -#ifdef CONFIG_SCHED_SMT /* * Use @prev_cpu's sibling if it's idle. */ @@ -634,7 +631,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, goto out_unlock; } } -#endif /* * Search for any idle CPU in the same LLC domain. @@ -714,7 +710,6 @@ static void update_builtin_idle(int cpu, bool idle) assign_cpu(cpu, idle_cpus, idle); -#ifdef CONFIG_SCHED_SMT if (sched_smt_active()) { const struct cpumask *smt = cpu_smt_mask(cpu); struct cpumask *idle_smts = idle_cpumask(node)->smt; @@ -731,7 +726,6 @@ static void update_builtin_idle(int cpu, bool idle) cpumask_andnot(idle_smts, idle_smts, smt); } } -#endif } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 09d3acd2d2bc..233bd2ebbb73 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1555,7 +1555,6 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) static inline bool is_core_idle(int cpu) { -#ifdef CONFIG_SCHED_SMT int sibling; for_each_cpu(sibling, cpu_smt_mask(cpu)) { @@ -1565,7 +1564,6 @@ static inline bool is_core_idle(int cpu) if (!idle_cpu(sibling)) return false; } -#endif return true; } @@ -2248,7 +2246,6 @@ numa_type numa_classify(unsigned int imbalance_pct, return node_fully_busy; } -#ifdef CONFIG_SCHED_SMT /* Forward declarations of select_idle_sibling helpers */ static inline bool test_idle_cores(int cpu); static inline int numa_idle_core(int idle_core, int cpu) @@ -2266,12 +2263,6 @@ static inline int numa_idle_core(int idle_core, int cpu) return idle_core; } -#else /* !CONFIG_SCHED_SMT: */ -static inline int numa_idle_core(int idle_core, int cpu) -{ - return idle_core; -} -#endif /* !CONFIG_SCHED_SMT */ /* * Gather all necessary information to make NUMA balancing placement @@ -7778,7 +7769,6 @@ static inline int __select_idle_cpu(int cpu, struct task_struct *p) return -1; } -#ifdef CONFIG_SCHED_SMT DEFINE_STATIC_KEY_FALSE(sched_smt_present); EXPORT_SYMBOL_GPL(sched_smt_present); @@ -7888,29 +7878,6 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t return -1; } -#else /* !CONFIG_SCHED_SMT: */ - -static inline void set_idle_cores(int cpu, int val) -{ -} - -static inline bool test_idle_cores(int cpu) -{ - return false; -} - -static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) -{ - return __select_idle_cpu(core, p); -} - -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) -{ - return -1; -} - -#endif /* !CONFIG_SCHED_SMT */ - /* * Scan the LLC domain for idle CPUs; this is dynamically regulated by * comparing the average scan cost (tracked in sd->avg_scan_cost) against the @@ -12002,9 +11969,7 @@ static int should_we_balance(struct lb_env *env) * idle has been found, then its not needed to check other * SMT siblings for idleness: */ -#ifdef CONFIG_SCHED_SMT cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu)); -#endif continue; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9f63b15d309d..e476623a0c2a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1667,7 +1667,6 @@ do { \ flags = _raw_spin_rq_lock_irqsave(rq); \ } while (0) -#ifdef CONFIG_SCHED_SMT extern void __update_idle_core(struct rq *rq); static inline void update_idle_core(struct rq *rq) @@ -1676,12 +1675,7 @@ static inline void update_idle_core(struct rq *rq) __update_idle_core(rq); } -#else /* !CONFIG_SCHED_SMT: */ -static inline void update_idle_core(struct rq *rq) { } -#endif /* !CONFIG_SCHED_SMT */ - #ifdef CONFIG_FAIR_GROUP_SCHED - static inline struct task_struct *task_of(struct sched_entity *se) { WARN_ON_ONCE(!entity_is_task(se)); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 5847b83d9d55..a1f46e3f4ede 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1310,9 +1310,7 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) cpumask_copy(mask, sched_group_span(sg)); for_each_cpu(cpu, mask) { cores++; -#ifdef CONFIG_SCHED_SMT cpumask_andnot(mask, mask, cpu_smt_mask(cpu)); -#endif } sg->cores = cores; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 3fe6b0c99f3d..773d8e9ae30c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -633,6 +633,11 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) EXPORT_SYMBOL_GPL(stop_machine); #ifdef CONFIG_SCHED_SMT +/* + * INTEL_IFS is the only user of this API. That selftest can + * only be compiled if SMP=y. On x86 it selects SCHED_SMT. + * Keep the ifdefs for now. + */ int stop_core_cpuslocked(unsigned int cpu, cpu_stop_fn_t fn, void *data) { const struct cpumask *smt_mask = cpu_smt_mask(cpu); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5f747f241a5f..99ef412f02a6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -8187,11 +8187,7 @@ static bool __init cpus_dont_share(int cpu0, int cpu1) static bool __init cpus_share_smt(int cpu0, int cpu1) { -#ifdef CONFIG_SCHED_SMT return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1)); -#else - return false; -#endif } static bool __init cpus_share_numa(int cpu0, int cpu1) -- cgit v1.2.3 From 9e005ed21152d4a4bb0ceea71045ff8a642a6feb Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Tue, 19 May 2026 05:14:23 +0000 Subject: sched/topology: Allow multiple domains to claim sched_domain_shared Recent optimizations of sd->shared assignment moved to allocating a single instance of per-CPU sched_domain_shared objects per s_data. Recent optimizations to select_idle_capacity() moved the sd->shared assignments to "sd_asym" domain when ASYM_CPUCAPACITY is detected but cache-aware scheduling mandates the presence of "sd_llc_shared" to compute and cache per-LLC statistics. Use an "alloc_flags" union in sched_domain_shared to claim a sched_domain_shared object per sched_domain. Allocation starts searching for an available / matching sched_domain_shared instance from the first CPU of sched_domain_span(sd) (sd can be sd_llc, or sd_asym). If the shared object is claimed by another domain, the instance corresponding to next CPU in the domain span is explored until a matching / available instance is found. In case of a single CPU in sched_domain_span(), the domain will be degenerated and a temporary overlap of ->shared objects across different domains is acceptable. "alloc_flags" forms a union with "nr_idle_scan" and the stale flags are left as is when the sd->shared is published. The expectation is for the first load balancing instance to correct the value just like the current behavior, except the initial value is no longer 0. Originally-by: Peter Zijlstra Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Tested-by: Andrea Righi --- include/linux/sched/topology.h | 16 ++++++++++- kernel/sched/topology.c | 63 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 69 insertions(+), 10 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index fe09d3268bc9..b5d9d7c2b8ad 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -67,7 +67,21 @@ struct sched_domain_shared { atomic_t ref; atomic_t nr_busy_cpus; int has_idle_cores; - int nr_idle_scan; + union { + int nr_idle_scan; + /* + * Used during allocation to claim the sched_domain_shared + * object at multiple levels. + * + * Note: between build and the first periodic LB tick, which + * rewrites the union via update_idle_cpu_scan(), readers of + * nr_idle_scan may observe the transient SD_* flag value as + * the scan bound. The flag bits are small positive integers, + * so the effect is just a slightly relaxed scan bound for one + * window and self-heals on the first tick. + */ + int alloc_flags; + }; #ifdef CONFIG_SCHED_CACHE unsigned long util_avg; unsigned long capacity; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index dbfd9657f897..df2ceb54c970 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -623,6 +623,12 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) } while (sg != first); } +static void free_sched_domain_shared(struct sched_domain_shared *sds) +{ + if (sds && atomic_dec_and_test(&sds->ref)) + kfree(sds); +} + static void destroy_sched_domain(struct sched_domain *sd) { /* @@ -631,9 +637,7 @@ static void destroy_sched_domain(struct sched_domain *sd) * dropping group/capacity references, freeing where none remain. */ free_sched_groups(sd->groups, 1); - - if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) - kfree(sd->shared); + free_sched_domain_shared(sd->shared); #ifdef CONFIG_SCHED_CACHE /* only the bottom sd has llc_counts array */ @@ -755,7 +759,14 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) /* Pick reference to parent->shared. */ if (parent->shared) { - WARN_ON_ONCE(tmp->shared); + /* + * It is safe to free a sd->shared that + * has not been published yet. If a + * sd->shared was published, the refcount + * will end up being non-zero and it will + * not be freed here. + */ + free_sched_domain_shared(tmp->shared); tmp->shared = parent->shared; parent->shared = NULL; } @@ -2916,11 +2927,45 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc) } } -static void init_sched_domain_shared(struct s_data *d, struct sched_domain *sd) +static void +init_sched_domain_shared(struct s_data *d, struct sched_domain *sd, int flags) { - int sd_id = cpumask_first(sched_domain_span(sd)); + struct sched_domain_shared *sds = NULL; + int cpu; + + /* + * Multiple domains can try to claim a shared object like + * SD_ASYM_CPUCAPACITY and SD_SHARE_LLC which can alias to + * same cpumask_first(sched_domain_span(sd)) CPU and can + * cause "nr_idle_scan" to be populated incorrectly during + * load balancing. + * + * Find the first CPU in sched_domain_span(sd) with an + * unclaimed domain (!alloc_flags) or where the alloc_flag + * matches the requested flag (SD_* flag) + * + * If the domain only has single CPU, allow temporary overlap + * in allocation since the domains will be degenerated later. + */ + for_each_cpu(cpu, sched_domain_span(sd)) { + sds = *per_cpu_ptr(d->sds, cpu); + + if (!sds->alloc_flags || + sd->span_weight == 1 || + sds->alloc_flags == flags) { + sds->alloc_flags = flags; + sd->shared = sds; + break; + } + } + + /* + * Use the sd_shared corresponding to the last + * CPU in the span if none are avaialable. + */ + if (WARN_ON_ONCE(!sd->shared)) + sd->shared = sds; - sd->shared = *per_cpu_ptr(d->sds, sd_id); /* * nr_busy_cpus is consumed only by the NOHZ kick path via * sd_balance_shared; on the asym-capacity path it is initialized but @@ -2960,7 +3005,7 @@ static bool claim_asym_sched_domain_shared(struct s_data *d, int cpu) if (!sd_asym || (sd_asym->flags & SD_NUMA)) return false; - init_sched_domain_shared(d, sd_asym); + init_sched_domain_shared(d, sd_asym, SD_ASYM_CPUCAPACITY); return true; } @@ -3115,7 +3160,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att sd = sd->parent; if (sd->flags & SD_SHARE_LLC) { - init_sched_domain_shared(&d, sd); + init_sched_domain_shared(&d, sd, SD_SHARE_LLC); /* * In presence of higher domains, adjust the -- cgit v1.2.3