summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Zijlstra <peterz@infradead.org>2026-05-19 12:18:01 +0200
committerPeter Zijlstra <peterz@infradead.org>2026-05-19 12:18:01 +0200
commita26d9208c1376ac3877d9f12e697f83368e2af1c (patch)
tree1f68b9ae26b8b88a7d1f24fea2a6df87e4c0fee5
parentdd29c017aed628076e915fe4cdfb5392fd4c5cab (diff)
parentc99b8593b060931c5a0a4b701689f8d6a2c00dbf (diff)
downloadlinux-next-a26d9208c1376ac3877d9f12e697f83368e2af1c.tar.gz
linux-next-a26d9208c1376ac3877d9f12e697f83368e2af1c.zip
Merge branch 'sched/cache'
Merge the cache aware balancer topic branch. # Conflicts: # kernel/sched/topology.c
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt12
-rw-r--r--arch/x86/include/asm/processor.h5
-rw-r--r--arch/x86/kernel/smpboot.c20
-rw-r--r--drivers/base/cacheinfo.c23
-rw-r--r--include/linux/cacheinfo.h1
-rw-r--r--include/linux/mm_types.h32
-rw-r--r--include/linux/sched.h30
-rw-r--r--include/linux/sched/topology.h16
-rw-r--r--init/Kconfig11
-rw-r--r--init/init_task.c4
-rw-r--r--kernel/exit.c29
-rw-r--r--kernel/fork.c6
-rw-r--r--kernel/sched/core.c13
-rw-r--r--kernel/sched/debug.c60
-rw-r--r--kernel/sched/fair.c1145
-rw-r--r--kernel/sched/sched.h51
-rw-r--r--kernel/sched/topology.c447
17 files changed, 1849 insertions, 56 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 4d0f545fb3ec..f575d450861e 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -7236,6 +7236,18 @@ Kernel parameters
Not specifying this option is equivalent to
spec_store_bypass_disable=auto.
+ split_llc=
+ [X86,EARLY] Split the LLC N-ways
+
+ When set, the LLC is split this many ways by matching
+ 'core_id % n'. This is set up before SMP bringup and
+ used during SMP bringup, before the full topology is
+ known. If your core count doesn't divide evenly by
+ the number given, you get to keep the pieces.
+
+ This is mostly a debug feature to emulate multiple LLCs
+ on hardware that only has a single LLC.
+
split_lock_detect=
[X86] Enable split lock detection or bus lock detection
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 10b5355b323e..eb9fce94620d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -704,6 +704,11 @@ static inline u32 per_cpu_l2c_id(unsigned int cpu)
return per_cpu(cpu_info.topo.l2c_id, cpu);
}
+/* Return the core ID recorded in the cpu_info topology data of @cpu. */
+static inline u32 per_cpu_core_id(unsigned int cpu)
+{
+	u32 core_id = per_cpu(cpu_info.topo.core_id, cpu);
+
+	return core_id;
+}
+
#ifdef CONFIG_CPU_SUP_AMD
/*
* Issue a DIV 0/1 insn to clear any division data from previous DIV
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 294a8ea60298..cb999feb66b0 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -424,6 +424,21 @@ static const struct x86_cpu_id intel_cod_cpu[] = {
{}
};
+/*
+ * Allows splitting the LLC by matching 'core_id % split_llc'.
+ *
+ * This is mostly a debug hack to emulate systems with multiple LLCs per node
+ * on systems that do not naturally have this.
+ *
+ * Zero (the default) leaves the LLC unsplit.
+ */
+static unsigned int split_llc;
+
+/*
+ * Parse "split_llc=<n>" from the kernel command line.
+ *
+ * get_option() stores its result through an 'int *', so parse into a local
+ * of that type instead of aliasing the unsigned global. A missing or
+ * non-numeric value, or a negative count (which would otherwise wrap to a
+ * huge modulus), is rejected with -EINVAL and leaves split_llc untouched.
+ */
+static int __init split_llc_setup(char *str)
+{
+	int ways = 0;
+
+	if (!get_option(&str, &ways) || ways < 0)
+		return -EINVAL;
+
+	split_llc = ways;
+	return 0;
+}
+early_param("split_llc", split_llc_setup);
+
static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu);
@@ -438,6 +453,11 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
if (per_cpu_llc_id(cpu1) != per_cpu_llc_id(cpu2))
return false;
+ if (split_llc &&
+ (per_cpu_core_id(cpu1) % split_llc) !=
+ (per_cpu_core_id(cpu2) % split_llc))
+ return false;
+
/*
* Allow the SNC topology without warning. Return of false
* means 'c' does not share the LLC of 'o'. This will be
diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
index 391ac5e3d2f5..70701d3bc81c 100644
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -17,6 +17,7 @@
#include <linux/init.h>
#include <linux/of.h>
#include <linux/sched.h>
+#include <linux/sched/topology.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/sysfs.h>
@@ -68,6 +69,24 @@ bool last_level_cache_is_valid(unsigned int cpu)
}
+/*
+ * Look up the cacheinfo describing the last level cache of @cpu.
+ * Derived from update_per_cpu_data_slice_size_cpu().
+ *
+ * Returns NULL when the last level cache information is not valid, or when
+ * the last cache leaf is neither a data cache nor a unified cache.
+ */
+struct cacheinfo *get_cpu_cacheinfo_llc(unsigned int cpu)
+{
+	struct cacheinfo *leaf;
+
+	if (!last_level_cache_is_valid(cpu))
+		return NULL;
+
+	/* the LLC is the last leaf in the per-CPU cacheinfo array */
+	leaf = per_cpu_cacheinfo_idx(cpu, cache_leaves(cpu) - 1);
+
+	switch (leaf->type) {
+	case CACHE_TYPE_DATA:
+	case CACHE_TYPE_UNIFIED:
+		return leaf;
+	default:
+		return NULL;
+	}
+}
+
bool last_level_cache_is_shared(unsigned int cpu_x, unsigned int cpu_y)
{
struct cacheinfo *llc_x, *llc_y;
@@ -1018,6 +1037,7 @@ static int cacheinfo_cpu_online(unsigned int cpu)
goto err;
if (cpu_map_shared_cache(true, cpu, &cpu_map))
update_per_cpu_data_slice_size(true, cpu, cpu_map);
+ sched_update_llc_bytes(cpu);
return 0;
err:
free_cache_attributes(cpu);
@@ -1036,6 +1056,9 @@ static int cacheinfo_cpu_pre_down(unsigned int cpu)
free_cache_attributes(cpu);
if (nr_shared > 1)
update_per_cpu_data_slice_size(false, cpu, cpu_map);
+
+ sched_update_llc_bytes(cpu);
+
return 0;
}
diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
index c8f4f0a0b874..fc879ac4cc4f 100644
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -89,6 +89,7 @@ int populate_cache_leaves(unsigned int cpu);
int cache_setup_acpi(unsigned int cpu);
bool last_level_cache_is_valid(unsigned int cpu);
bool last_level_cache_is_shared(unsigned int cpu_x, unsigned int cpu_y);
+struct cacheinfo *get_cpu_cacheinfo_llc(unsigned int cpu);
int fetch_cache_info(unsigned int cpu);
int detect_cache_attributes(unsigned int cpu);
#ifndef CONFIG_ACPI_PPTT
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index a308e2c23b82..c7db35be6a30 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1222,6 +1222,8 @@ struct mm_struct {
/* MM CID related storage */
struct mm_mm_cid mm_cid;
+ /* sched_cache related statistics */
+ struct sched_cache_stat sc_stat;
#ifdef CONFIG_MMU
atomic_long_t pgtables_bytes; /* size of all page tables */
#endif
@@ -1628,6 +1630,36 @@ static inline unsigned int mm_cid_size(void)
# define MM_CID_STATIC_SIZE 0
#endif /* CONFIG_SCHED_MM_CID */
+#ifdef CONFIG_SCHED_CACHE
+void mm_init_sched(struct mm_struct *mm,
+ struct sched_cache_time __percpu *pcpu_sched);
+
+/*
+ * Allocate the per-CPU sched_cache_time storage for @mm and hand it to
+ * mm_init_sched(). Returns 0 on success, -ENOMEM if the allocation fails.
+ */
+static inline int mm_alloc_sched_noprof(struct mm_struct *mm)
+{
+	struct sched_cache_time __percpu *pcpu;
+
+	pcpu = alloc_percpu_noprof(struct sched_cache_time);
+	if (pcpu) {
+		mm_init_sched(mm, pcpu);
+		return 0;
+	}
+
+	return -ENOMEM;
+}
+
+#define mm_alloc_sched(...) alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__))
+
+/* Free the per-CPU sched_cache_time storage of @mm and clear the pointer. */
+static inline void mm_destroy_sched(struct mm_struct *mm)
+{
+	struct sched_cache_stat *stat = &mm->sc_stat;
+
+	free_percpu(stat->pcpu_sched);
+	stat->pcpu_sched = NULL;
+}
+#else /* !CONFIG_SCHED_CACHE */
+
+/* No-op stand-ins: nothing to allocate or free without CONFIG_SCHED_CACHE. */
+static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; }
+static inline void mm_destroy_sched(struct mm_struct *mm) { }
+
+#endif /* CONFIG_SCHED_CACHE */
+
struct mmu_gather;
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 368c7b4d7cb5..da6a0907a78c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1408,6 +1408,13 @@ struct task_struct {
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_SCHED_CACHE
+ struct callback_head cache_work;
+ int preferred_llc;
+ /* 1: task was enqueued to its preferred LLC, 0 otherwise */
+ int pref_llc_queued;
+#endif
+
struct rseq_data rseq;
struct sched_mm_cid mm_cid;
@@ -2408,6 +2415,29 @@ static __always_inline int task_mm_cid(struct task_struct *t)
}
#endif
+#ifdef CONFIG_SCHED_CACHE
+
+/*
+ * Per-CPU runtime sample kept for one mm; see sched_cache_stat::pcpu_sched.
+ */
+struct sched_cache_time {
+ /* execution time charged on this CPU; shifted right once per elapsed epoch */
+ u64 runtime;
+ /* the rq's cpu_epoch value that @runtime has been decayed up to */
+ unsigned long epoch;
+};
+
+/*
+ * Per-mm state used by the cache aware balancer (embedded as mm->sc_stat).
+ */
+struct sched_cache_stat {
+ /* per-CPU runtime samples; stays NULL until mm_init_sched() publishes it */
+ struct sched_cache_time __percpu *pcpu_sched;
+ /* held by task_tick_cache() while it queues the scan work */
+ raw_spinlock_t lock;
+ /* rq epoch recorded when the scan work was last queued */
+ unsigned long epoch;
+ /* smoothed count of CPUs found running threads of this mm during a scan */
+ u64 nr_running_avg;
+ /* jiffies value before which task_cache_work() will not scan again */
+ unsigned long next_scan;
+ /* accumulated NUMA fault deltas; multiplied by PAGE_SIZE when compared with the LLC size */
+ unsigned long footprint;
+ /* CPU identifying the preferred LLC, or -1 when there is no preference */
+ int cpu;
+} ____cacheline_aligned_in_smp;
+
+#else
+
+struct sched_cache_stat { };
+
+#endif
+
#ifndef MODULE
#ifndef COMPILE_OFFSETS
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 36553e14866d..fe09d3268bc9 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -68,6 +68,10 @@ struct sched_domain_shared {
atomic_t nr_busy_cpus;
int has_idle_cores;
int nr_idle_scan;
+#ifdef CONFIG_SCHED_CACHE
+ unsigned long util_avg;
+ unsigned long capacity;
+#endif
};
struct sched_domain {
@@ -99,6 +103,12 @@ struct sched_domain {
u64 max_newidle_lb_cost;
unsigned long last_decay_max_lb_cost;
+#ifdef CONFIG_SCHED_CACHE
+ unsigned int llc_max;
+ unsigned int *llc_counts __counted_by_ptr(llc_max);
+ unsigned long llc_bytes;
+#endif
+
#ifdef CONFIG_SCHEDSTATS
/* sched_balance_rq() stats */
unsigned int lb_count[CPU_MAX_IDLE_TYPES];
@@ -256,4 +266,10 @@ static inline int task_node(const struct task_struct *p)
return cpu_to_node(task_cpu(p));
}
+#ifdef CONFIG_SCHED_CACHE
+extern void sched_update_llc_bytes(unsigned int cpu);
+#else
+static inline void sched_update_llc_bytes(unsigned int cpu) { }
+#endif
+
#endif /* _LINUX_SCHED_TOPOLOGY_H */
diff --git a/init/Kconfig b/init/Kconfig
index 2937c4d308ae..a52cd95ef171 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1019,6 +1019,17 @@ config NUMA_BALANCING
This system will be inactive on UMA systems.
+config SCHED_CACHE
+ bool "Cache aware load balance"
+ default y
+ depends on SMP
+ help
+ When enabled, the scheduler will attempt to aggregate tasks from
+ the same process onto a single Last Level Cache (LLC) domain when
+ possible. This improves cache locality by keeping tasks that share
+ resources within the same cache domain, reducing cache misses and
+ lowering data access latency.
+
config NUMA_BALANCING_DEFAULT_ENABLED
bool "Automatically enable NUMA aware memory/task placement"
default y
diff --git a/init/init_task.c b/init/init_task.c
index b5f48ebdc2b6..3ecd66fbd563 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -215,6 +215,10 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
.numa_group = NULL,
.numa_faults = NULL,
#endif
+#ifdef CONFIG_SCHED_CACHE
+ .preferred_llc = -1,
+ .pref_llc_queued = 0,
+#endif
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
.kasan_depth = 1,
#endif
diff --git a/kernel/exit.c b/kernel/exit.c
index 25e9cb6de7e7..c6e7047739b0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -543,6 +543,32 @@ void mm_update_next_owner(struct mm_struct *mm)
}
#endif /* CONFIG_MEMCG */
+#if defined(CONFIG_SCHED_CACHE) && defined(CONFIG_NUMA_BALANCING)
+/*
+ * Remove the exiting task's NUMA fault count from the footprint
+ * tracked for @mm.
+ */
+static void exit_mm_sched_cache(struct mm_struct *mm)
+{
+	unsigned long faults = current->total_numa_faults;
+	unsigned long fp;
+
+	if (!faults)
+		return;
+
+	/*
+	 * Deliberately lockless for performance. Subtract at most the
+	 * current value so that mm->sc_stat.footprint never wraps below
+	 * zero.
+	 */
+	fp = READ_ONCE(mm->sc_stat.footprint);
+	WRITE_ONCE(mm->sc_stat.footprint, fp - min(fp, faults));
+}
+#else
+static inline void exit_mm_sched_cache(struct mm_struct *mm)
+{
+}
+#endif /* CONFIG_SCHED_CACHE CONFIG_NUMA_BALANCING */
+
/*
* Turn us into a lazy TLB process if we
* aren't already..
@@ -554,6 +580,9 @@ static void exit_mm(void)
exit_mm_release(current, mm);
if (!mm)
return;
+
+ exit_mm_sched_cache(mm);
+
mmap_read_lock(mm);
mmgrab_lazy_tlb(mm);
BUG_ON(mm != current->active_mm);
diff --git a/kernel/fork.c b/kernel/fork.c
index 5f3fdfdb14c7..a679b2448234 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -726,6 +726,7 @@ void __mmdrop(struct mm_struct *mm)
cleanup_lazy_tlbs(mm);
WARN_ON_ONCE(mm == current->active_mm);
+ mm_destroy_sched(mm);
mm_free_pgd(mm);
mm_free_id(mm);
destroy_context(mm);
@@ -1128,6 +1129,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (mm_alloc_cid(mm, p))
goto fail_cid;
+ if (mm_alloc_sched(mm))
+ goto fail_sched;
+
if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
NR_MM_COUNTERS))
goto fail_pcpu;
@@ -1137,6 +1141,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
return mm;
fail_pcpu:
+ mm_destroy_sched(mm);
+fail_sched:
mm_destroy_cid(mm);
fail_cid:
destroy_context(mm);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 77f4ebe8f5c7..7fb3f5f2d48c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -548,6 +548,11 @@ void __trace_set_current_state(int state_value)
}
EXPORT_SYMBOL(__trace_set_current_state);
+/* Return the LLC id (sd_llc_id) of the CPU that @p is currently assigned to. */
+int task_llc(const struct task_struct *p)
+{
+	int cpu = task_cpu(p);
+
+	return per_cpu(sd_llc_id, cpu);
+}
+
/*
* Serialization rules:
*
@@ -4506,6 +4511,7 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p)
init_numa_balancing(clone_flags, p);
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
+ init_sched_mm(p);
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -8699,6 +8705,8 @@ int sched_cpu_deactivate(unsigned int cpu)
*/
synchronize_rcu();
+ sched_domains_free_llc_id(cpu);
+
sched_set_rq_offline(rq, cpu);
scx_rq_deactivate(rq);
@@ -9030,6 +9038,11 @@ void __init sched_init(void)
rq->core_cookie = 0UL;
#endif
+#ifdef CONFIG_SCHED_CACHE
+ raw_spin_lock_init(&rq->cpu_epoch_lock);
+ rq->cpu_epoch_next = jiffies;
+#endif
+
zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 74c1617cf652..ed3a0d65da0c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -210,6 +210,48 @@ static const struct file_operations sched_scaling_fops = {
.release = single_release,
};
+#ifdef CONFIG_SCHED_CACHE
+/*
+ * debugfs write handler for llc_balancing/enabled: parse a boolean from
+ * user space, store it in sysctl_sched_cache_user and re-evaluate the
+ * active state. Returns @cnt on success or the parse error.
+ */
+static ssize_t
+sched_cache_enable_write(struct file *filp, const char __user *ubuf,
+			 size_t cnt, loff_t *ppos)
+{
+	bool enable;
+	int err = kstrtobool_from_user(ubuf, cnt, &enable);
+
+	if (err)
+		return err;
+
+	sysctl_sched_cache_user = enable;
+	sched_cache_active_set();
+
+	*ppos += cnt;
+	return cnt;
+}
+
+/* seq_file show callback: print the user-requested enable state. */
+static int sched_cache_enable_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%d\n", sysctl_sched_cache_user);
+
+	return 0;
+}
+
+/* Open callback: bind the file to sched_cache_enable_show() via single_open(). */
+static int sched_cache_enable_open(struct inode *inode, struct file *filp)
+{
+	int ret = single_open(filp, sched_cache_enable_show, NULL);
+
+	return ret;
+}
+
+/* File operations backing the debugfs "enabled" knob of the LLC balancer. */
+static const struct file_operations sched_cache_enable_fops = {
+	.open		= sched_cache_enable_open,
+	.release	= single_release,
+	.read		= seq_read,
+	.write		= sched_cache_enable_write,
+	.llseek		= seq_lseek,
+};
+#endif
+
#ifdef CONFIG_PREEMPT_DYNAMIC
static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
@@ -593,7 +635,7 @@ static void debugfs_ext_server_init(void)
static __init int sched_init_debug(void)
{
- struct dentry __maybe_unused *numa;
+ struct dentry __maybe_unused *numa, *llc;
debugfs_sched = debugfs_create_dir("sched", NULL);
@@ -626,6 +668,22 @@ static __init int sched_init_debug(void)
debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_SCHED_CACHE
+ llc = debugfs_create_dir("llc_balancing", debugfs_sched);
+ debugfs_create_file("enabled", 0644, llc, NULL,
+ &sched_cache_enable_fops);
+ debugfs_create_u32("aggr_tolerance", 0644, llc,
+ &llc_aggr_tolerance);
+ debugfs_create_u32("epoch_period", 0644, llc,
+ &llc_epoch_period);
+ debugfs_create_u32("epoch_affinity_timeout", 0644, llc,
+ &llc_epoch_affinity_timeout);
+ debugfs_create_u32("overaggr_pct", 0644, llc,
+ &llc_overaggr_pct);
+ debugfs_create_u32("imb_pct", 0644, llc,
+ &llc_imb_pct);
+#endif
+
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
debugfs_fair_server_init();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 69ba882681c5..8e858ca6bcd0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1321,6 +1321,8 @@ void post_init_entity_util_avg(struct task_struct *p)
sa->runnable_avg = sa->util_avg;
}
+static inline void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec);
+
static s64 update_se(struct rq *rq, struct sched_entity *se)
{
u64 now = rq_clock_task(rq);
@@ -1343,6 +1345,7 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
trace_sched_stat_runtime(running, delta_exec);
account_group_exec_runtime(running, delta_exec);
+ account_mm_sched(rq, running, delta_exec);
/* cgroup time is always accounted against the donor */
cgroup_account_cputime(donor, delta_exec);
@@ -1364,6 +1367,581 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
static void set_next_buddy(struct sched_entity *se);
+#ifdef CONFIG_SCHED_CACHE
+
+/*
+ * XXX numbers come from a place the sun don't shine -- probably wants to be SD
+ * tunable or so.
+ *
+ * The variables below are exposed under debugfs in sched/llc_balancing.
+ */
+#define EPOCH_PERIOD (HZ / 100) /* 10 ms */
+#define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */
+/* 0 yields a scale of 0, >= 100 disables the size checks; see get_sched_cache_scale() */
+__read_mostly unsigned int llc_aggr_tolerance = 1;
+/* length of one epoch in jiffies; readers treat 0 as 1 */
+__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD;
+/* epochs without a queued scan after which the preferred LLC is dropped */
+__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT;
+/* NOTE(review): presumably an imbalance threshold in percent; user not visible here -- confirm */
+__read_mostly unsigned int llc_imb_pct = 20;
+/* NOTE(review): presumably an over-aggregation threshold in percent; user not visible here -- confirm */
+__read_mostly unsigned int llc_overaggr_pct = 50;
+
+/* Map @cpu to its LLC id (sd_llc_id); a negative @cpu maps to -1. */
+static int llc_id(int cpu)
+{
+	return cpu < 0 ? -1 : per_cpu(sd_llc_id, cpu);
+}
+
+/*
+ * Turn llc_aggr_tolerance into a scale factor:
+ *   tolerance 0      -> 0
+ *   tolerance >= 100 -> INT_MAX (callers skip their size check)
+ *   otherwise        -> 1 + (tolerance - 1) * @mul
+ */
+static inline int get_sched_cache_scale(int mul)
+{
+	unsigned int tolerance = READ_ONCE(llc_aggr_tolerance);
+
+	if (tolerance >= 100)
+		return INT_MAX;
+
+	if (tolerance == 0)
+		return 0;
+
+	return (1 + (tolerance - 1) * mul);
+}
+
+/*
+ * Report whether the footprint recorded for @mm is larger than the scaled
+ * LLC size seen from @cpu.
+ *
+ * Without CONFIG_NUMA_BALANCING this is always false. With it, a CPU that
+ * has no sched domain attached reports true, and the footprint is only
+ * consulted while NUMA balancing is switched on.
+ */
+static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
+{
+#ifdef CONFIG_NUMA_BALANCING
+	unsigned long llc_size, fp;
+	struct sched_domain *sd;
+	int factor;
+
+	guard(rcu)();
+
+	sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd);
+	if (!sd)
+		return true;
+
+	if (!static_branch_likely(&sched_numa_balancing))
+		return false;
+
+	/*
+	 * TBD: RDT exclusive LLC ways reserved should be
+	 * excluded.
+	 */
+	llc_size = sd->llc_bytes;
+	fp = READ_ONCE(mm->sc_stat.footprint);
+
+	/*
+	 * The LLC size is scaled by a factor derived from
+	 * llc_aggr_tolerance (with a multiplier of 256) before it is
+	 * compared with the footprint.
+	 *
+	 * Example with a 32MB L3:
+	 *   tolerance 1:   the process exceeds the LLC once its
+	 *                  footprint is larger than 32MB.
+	 *   tolerance 99:  the limit becomes 784GB, i.e.
+	 *                  (1 + (99 - 1) * 256) * 32MB.
+	 *   tolerance 100: the footprint is ignored and aggregation
+	 *                  is always allowed.
+	 */
+	factor = get_sched_cache_scale(256);
+	if (factor == INT_MAX)
+		return false;
+
+	return ((llc_size * (u64)factor) < (fp * PAGE_SIZE));
+#else
+	return false;
+#endif
+}
+
+/*
+ * Decide whether the thread count of @p's process rules out LLC
+ * aggregation: true for single-threaded processes, and true when the
+ * average number of running threads does not fit the scaled LLC of @cpu.
+ */
+static bool invalid_llc_nr(struct mm_struct *mm, struct task_struct *p,
+			   int cpu)
+{
+	int factor;
+
+	/* nothing to aggregate for a single thread */
+	if (get_nr_threads(p) <= 1)
+		return true;
+
+	/*
+	 * The number of 'cores' in the LLC is scaled by
+	 * llc_aggr_tolerance before being compared against the
+	 * process's active threads.
+	 */
+	factor = get_sched_cache_scale(1);
+	if (factor == INT_MAX)
+		return false;
+
+	if (fits_capacity((mm->sc_stat.nr_running_avg * cpu_smt_num_threads),
+			  (factor * per_cpu(sd_llc_size, cpu))))
+		return false;
+
+	return true;
+}
+
+/*
+ * Account @p on @rq for the cache aware balancer.
+ *
+ * Tasks without a preferred LLC (preferred_llc < 0) are not counted. For the
+ * others this bumps rq->nr_llc_running, bumps rq->nr_pref_llc_running when
+ * the task currently sits on its preferred LLC, and increments the per-LLC
+ * counter in the rq's sched domain when the index is within llc_max.
+ * Paired with account_llc_dequeue().
+ */
+static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
+{
+ int pref_llc, pref_llc_queued;
+ struct sched_domain *sd;
+
+ pref_llc = p->preferred_llc;
+ if (pref_llc < 0)
+ return;
+
+ pref_llc_queued = (pref_llc == task_llc(p));
+ rq->nr_llc_running++;
+ rq->nr_pref_llc_running += pref_llc_queued;
+
+ /*
+ * Record whether p is enqueued on its preferred
+ * LLC, in order to pair with account_llc_dequeue()
+ * to maintain a consistent nr_pref_llc_running per
+ * runqueue.
+ * This is necessary because a race condition exists:
+ * after a task is enqueued on a runqueue, task_llc(p)
+ * may change due to CPU hotplug. Therefore, checking
+ * task_llc(p) to determine whether the task is being
+ * dequeued from its preferred LLC is unreliable and
+ * can cause inconsistent values - checking the
+ * p->pref_llc_queued in account_llc_dequeue() would
+ * be reliable.
+ */
+ p->pref_llc_queued = pref_llc_queued;
+
+ /* the unsigned cast keeps the index check a single upper-bound compare */
+ sd = rcu_dereference_all(rq->sd);
+ if (sd && (unsigned int)pref_llc < sd->llc_max)
+ sd->llc_counts[pref_llc]++;
+}
+
+/*
+ * Undo account_llc_enqueue() for @p on @rq.
+ *
+ * Uses p->pref_llc_queued, as recorded at enqueue time, to decide whether
+ * rq->nr_pref_llc_running has to be decremented, and never lets the per-LLC
+ * counter in the sched domain drop below zero.
+ */
+static void account_llc_dequeue(struct rq *rq, struct task_struct *p)
+{
+ struct sched_domain *sd;
+ int pref_llc;
+
+ pref_llc = p->preferred_llc;
+ if (pref_llc < 0)
+ return;
+
+ rq->nr_llc_running--;
+ if (p->pref_llc_queued) {
+ rq->nr_pref_llc_running--;
+ /*
+ * Update the status in case
+ * other logic might query
+ * this.
+ */
+ p->pref_llc_queued = 0;
+ }
+
+ sd = rcu_dereference_all(rq->sd);
+ if (sd && (unsigned int)pref_llc < sd->llc_max) {
+ /*
+ * There is a race condition between dequeue
+ * and CPU hotplug. After a task has been enqueued
+ * on CPUx, a CPU hotplug event occurs, and all online
+ * CPUs (including CPUx) rebuild their sched_domains
+ * and reset statistics to zero (including sd->llc_counts).
+ * This can cause temporary undercount and we have to
+ * check for such underflow in sd->llc_counts.
+ *
+ * This undercount is temporary and accurate accounting
+ * will resume once the rq has a chance to be idle.
+ */
+ if (sd->llc_counts[pref_llc])
+ sd->llc_counts[pref_llc]--;
+ }
+}
+
+/*
+ * Initialise the cache aware scheduling state of @mm.
+ *
+ * Every possible CPU's sample in @_pcpu_sched is reset and stamped with that
+ * CPU's current epoch, the mm-wide fields are set to their "no preference"
+ * defaults, and only then is @_pcpu_sched published in mm->sc_stat.
+ */
+void mm_init_sched(struct mm_struct *mm,
+ struct sched_cache_time __percpu *_pcpu_sched)
+{
+ unsigned long epoch = 0;
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct sched_cache_time *pcpu_sched = per_cpu_ptr(_pcpu_sched, i);
+ struct rq *rq = cpu_rq(i);
+
+ pcpu_sched->runtime = 0;
+ /* a slightly stale cpu epoch is acceptable */
+ pcpu_sched->epoch = rq->cpu_epoch;
+ /* the mm epoch ends up as the epoch of the last CPU visited */
+ epoch = rq->cpu_epoch;
+ }
+
+ raw_spin_lock_init(&mm->sc_stat.lock);
+ mm->sc_stat.epoch = epoch;
+ mm->sc_stat.cpu = -1;
+ mm->sc_stat.next_scan = jiffies;
+ mm->sc_stat.nr_running_avg = 0;
+ mm->sc_stat.footprint = 0;
+ /*
+ * Publish pcpu_sched last, with release semantics: the store must
+ * not be reordered before the initialization of the other sc_stat
+ * fields, otherwise readers that see a non-NULL pcpu_sched could
+ * observe an invalid sc_stat.epoch, etc.
+ */
+ smp_store_release(&mm->sc_stat.pcpu_sched, _pcpu_sched);
+}
+
+/*
+ * Right shift *@val by @n bits, yielding 0 for @n >= 64. C leaves a shift
+ * by the full width of the type (or more) undefined, hence the explicit
+ * check.
+ */
+static __always_inline void __shr_u64(u64 *val, unsigned int n)
+{
+	*val = (n < 64) ? (*val >> n) : 0;
+}
+
+/*
+ * Advance @rq's epoch to the present and bring @pcpu_sched up to date.
+ *
+ * For every epoch period that has passed since rq->cpu_epoch_next the rq's
+ * epoch counter is incremented and rq->cpu_runtime is halved; the per-mm
+ * sample is then halved once for every epoch it is behind the rq.
+ * The caller must hold rq->cpu_epoch_lock.
+ */
+static inline void __update_mm_sched(struct rq *rq,
+ struct sched_cache_time *pcpu_sched)
+{
+ lockdep_assert_held(&rq->cpu_epoch_lock);
+
+ /* guard against a period of 0 written through debugfs */
+ unsigned int period = max(READ_ONCE(llc_epoch_period), 1U);
+ unsigned long n, now = jiffies;
+ long delta = now - rq->cpu_epoch_next;
+
+ if (delta > 0) {
+ /* number of whole periods elapsed, rounded up */
+ n = (delta + period - 1) / period;
+ rq->cpu_epoch += n;
+ rq->cpu_epoch_next += n * period;
+ __shr_u64(&rq->cpu_runtime, n);
+ }
+
+ /* catch the per-mm sample up with the rq's epoch */
+ n = rq->cpu_epoch - pcpu_sched->epoch;
+ if (n) {
+ pcpu_sched->epoch += n;
+ __shr_u64(&pcpu_sched->runtime, n);
+ }
+}
+
+/*
+ * Return the share of @rq's decayed runtime that belongs to the mm owning
+ * @pcpu_sched, scaled by NICE_0_LOAD. Takes rq->cpu_epoch_lock (irqsave)
+ * and refreshes both counters first.
+ */
+static unsigned long fraction_mm_sched(struct rq *rq,
+ struct sched_cache_time *pcpu_sched)
+{
+ guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock);
+
+ __update_mm_sched(rq, pcpu_sched);
+
+ /*
+ * Runtime is a geometric series (r=0.5) and as such will sum to twice
+ * the accumulation period, this means the multiplication here should
+ * not overflow. The +1 in the divisor avoids a division by zero.
+ */
+ return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1);
+}
+
+/*
+ * Return the LLC id that @p should prefer according to @mm, or -1 when
+ * there is no mm, no preferred CPU is recorded, or the preference
+ * conflicts with the task's NUMA preferred node.
+ */
+static int get_pref_llc(struct task_struct *p, struct mm_struct *mm)
+{
+	int pref_cpu;
+
+	if (!mm)
+		return -1;
+
+	pref_cpu = READ_ONCE(mm->sc_stat.cpu);
+	if (pref_cpu == -1)
+		return -1;
+
+#ifdef CONFIG_NUMA_BALANCING
+	/*
+	 * NUMA balancing wins over the LLC preference. A conflict can
+	 * show up after sched_setnuma(), but account_mm_sched() is
+	 * expected to run far more often than sched_setnuma(), so the
+	 * conflict is short lived.
+	 */
+	if (static_branch_likely(&sched_numa_balancing) &&
+	    p->numa_preferred_nid >= 0 &&
+	    cpu_to_node(pref_cpu) != p->numa_preferred_nid)
+		return -1;
+#endif
+
+	return llc_id(pref_cpu);
+}
+
+static unsigned int task_running_on_cpu(int cpu, struct task_struct *p);
+
+/*
+ * Charge @delta_exec of runtime to the mm of @p on @rq and refresh the
+ * task's preferred LLC.
+ *
+ * Only fair-class tasks that have an mm with initialised per-CPU samples
+ * are accounted. The mm's preferred CPU is dropped when the mm has not
+ * queued a scan for more than llc_epoch_affinity_timeout epochs, or when
+ * the thread count or the footprint no longer suit the LLC. If the
+ * resulting preference differs from p->preferred_llc while @p is running
+ * on this rq, the LLC accounting is moved over to the new preference.
+ */
+static inline
+void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
+{
+	struct sched_cache_time *pcpu_sched;
+	struct mm_struct *mm = p->mm;
+	int mm_sched_llc = -1;
+	unsigned long epoch;
+	long timeout;
+
+	if (!sched_cache_enabled())
+		return;
+
+	if (p->sched_class != &fair_sched_class)
+		return;
+	/*
+	 * init_task, kthreads and user thread created
+	 * by user_mode_thread() don't have mm.
+	 */
+	if (!mm || !mm->sc_stat.pcpu_sched)
+		return;
+
+	pcpu_sched = per_cpu_ptr(mm->sc_stat.pcpu_sched, cpu_of(rq));
+
+	scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
+		__update_mm_sched(rq, pcpu_sched);
+		pcpu_sched->runtime += delta_exec;
+		rq->cpu_runtime += delta_exec;
+		epoch = rq->cpu_epoch;
+	}
+
+	/*
+	 * Compare as signed values. Where long is no wider than
+	 * unsigned int, comparing the signed epoch delta directly with
+	 * the unsigned tunable converts a negative delta (mm epoch ahead
+	 * of this CPU's epoch) into a huge positive one and invalidates
+	 * the preference spuriously. The tunable is written through
+	 * debugfs, hence READ_ONCE() and the clamp.
+	 */
+	timeout = min_t(unsigned int, READ_ONCE(llc_epoch_affinity_timeout),
+			INT_MAX);
+
+	/*
+	 * If this process hasn't hit task_cache_work() for a while invalidate
+	 * its preferred state.
+	 */
+	if ((long)(epoch - READ_ONCE(mm->sc_stat.epoch)) > timeout ||
+	    invalid_llc_nr(mm, p, cpu_of(rq)) ||
+	    exceed_llc_capacity(mm, cpu_of(rq))) {
+		if (READ_ONCE(mm->sc_stat.cpu) != -1)
+			WRITE_ONCE(mm->sc_stat.cpu, -1);
+	}
+
+	mm_sched_llc = get_pref_llc(p, mm);
+
+	/* task not on rq accounted later in account_entity_enqueue() */
+	if (task_running_on_cpu(rq->cpu, p) &&
+	    READ_ONCE(p->preferred_llc) != mm_sched_llc) {
+		/* drop the counts taken under the old preference first */
+		account_llc_dequeue(rq, p);
+		WRITE_ONCE(p->preferred_llc, mm_sched_llc);
+		account_llc_enqueue(rq, p);
+	}
+}
+
+/*
+ * Tick hook: queue the cache scan work for @p when the epoch of @rq has
+ * moved past the epoch recorded in the task's mm. Kernel threads and tasks
+ * whose mm has no per-CPU samples are ignored.
+ */
+static void task_tick_cache(struct rq *rq, struct task_struct *p)
+{
+	struct mm_struct *mm = p->mm;
+	unsigned long rq_epoch;
+
+	if (!sched_cache_enabled())
+		return;
+
+	if (!mm)
+		return;
+	if (p->flags & PF_KTHREAD)
+		return;
+	if (!mm->sc_stat.pcpu_sched)
+		return;
+
+	rq_epoch = rq->cpu_epoch;
+	/* never move the mm epoch backwards */
+	if (time_after_eq(mm->sc_stat.epoch, rq_epoch))
+		return;
+
+	guard(raw_spinlock)(&mm->sc_stat.lock);
+
+	/* a work item whose ->next points at itself is not queued */
+	if (p->cache_work.next != &p->cache_work)
+		return;
+
+	task_work_add(p, &p->cache_work, TWA_RESUME);
+	WRITE_ONCE(mm->sc_stat.epoch, rq_epoch);
+}
+
+/*
+ * Fill @cpus with the CPUs task_cache_work() should scan for @p.
+ *
+ * With NUMA balancing active and a preferred node set for @p, the nodes of
+ * the task's preferred node, of the mm's preferred CPU and of the CPU the
+ * task currently runs on are OR-ed into @cpus. In every other case @cpus
+ * is set to all online CPUs.
+ */
+static void get_scan_cpumasks(cpumask_var_t cpus, struct task_struct *p)
+{
+#ifdef CONFIG_NUMA_BALANCING
+	int cpu, curr_cpu, pref_nid;
+	/* stays NUMA_NO_NODE while the mm has no preferred CPU */
+	int nid = NUMA_NO_NODE;
+
+	if (!static_branch_likely(&sched_numa_balancing))
+		goto out;
+
+	cpu = READ_ONCE(p->mm->sc_stat.cpu);
+	if (cpu != -1)
+		nid = cpu_to_node(cpu);
+	curr_cpu = task_cpu(p);
+
+	/*
+	 * Scanning in the preferred NUMA node is ideal. However, the NUMA
+	 * preferred node is per-task rather than per-process. It is possible
+	 * for different threads of the process to have distinct preferred
+	 * nodes; consequently, the process-wide preferred LLC may bounce
+	 * between different nodes. As a workaround, maintain the scan
+	 * CPU mask to also cover the process's current preferred LLC and the
+	 * current running node to mitigate the bouncing risk.
+	 * TBD: numa_group should be considered during task aggregation.
+	 */
+	pref_nid = p->numa_preferred_nid;
+	/* honor the task's preferred node */
+	if (pref_nid == NUMA_NO_NODE)
+		goto out;
+
+	cpumask_or(cpus, cpus, cpumask_of_node(pref_nid));
+
+	/* honor the task's preferred LLC CPU */
+	if (nid != NUMA_NO_NODE && !cpumask_test_cpu(cpu, cpus))
+		cpumask_or(cpus, cpus, cpumask_of_node(nid));
+
+	/* make sure the task's current running node is included */
+	if (!cpumask_test_cpu(curr_cpu, cpus))
+		cpumask_or(cpus, cpus, cpumask_of_node(cpu_to_node(curr_cpu)));
+
+	return;
+
+out:
+#endif
+	cpumask_copy(cpus, cpu_online_mask);
+}
+
+/*
+ * Move *@avg towards @sample by 1/divisor of their difference, where the
+ * divisor depends on the LLC size of the current CPU.
+ */
+static inline void update_avg_scale(u64 *avg, u64 sample)
+{
+	int llc_cpus = per_cpu(sd_llc_size, raw_smp_processor_id());
+	/*
+	 * A quarter of the LLC's CPU count, kept within [2, 8]: small LLCs
+	 * react faster to changes in nr_running, large ones are capped at
+	 * 8, the default EWMA smoothing factor used by update_avg().
+	 */
+	u32 weight = clamp_t(u32, (llc_cpus >> 2), 2, 8);
+	s64 delta = sample - *avg;
+
+	*avg += div64_s64(delta, weight);
+}
+
+/*
+ * Task-work callback (installed by init_sched_mm(), queued from
+ * task_tick_cache()) that re-evaluates the preferred LLC of current's mm.
+ *
+ * At most one thread per llc_epoch_period scans. The scan walks the LLC
+ * domains of the CPUs chosen by get_scan_cpumasks(), sums the mm's
+ * occupancy per LLC, and remembers the busiest CPU of the LLC with the
+ * highest sum. That CPU becomes mm->sc_stat.cpu only when its LLC has more
+ * than twice the occupancy of the currently preferred one. The number of
+ * CPUs found running a thread of this mm feeds nr_running_avg.
+ */
+static void task_cache_work(struct callback_head *work)
+{
+ int cpu, m_a_cpu = -1, nr_running = 0, curr_cpu;
+ unsigned long next_scan, now = jiffies;
+ struct task_struct *p = current, *cur;
+ unsigned long curr_m_a_occ = 0;
+ struct mm_struct *mm = p->mm;
+ unsigned long m_a_occ = 0;
+ cpumask_var_t cpus;
+
+ WARN_ON_ONCE(work != &p->cache_work);
+
+ /* mark the work as no longer queued so task_tick_cache() can re-add it */
+ work->next = work;
+
+ if (p->flags & PF_EXITING)
+ return;
+
+ next_scan = READ_ONCE(mm->sc_stat.next_scan);
+ if (time_before(now, next_scan))
+ return;
+
+ /* only 1 thread is allowed to scan */
+ if (!try_cmpxchg(&mm->sc_stat.next_scan, &next_scan,
+ now + max_t(unsigned long,
+ READ_ONCE(llc_epoch_period), 1)))
+ return;
+
+ /* drop any existing preference if the process no longer qualifies */
+ curr_cpu = task_cpu(p);
+ if (invalid_llc_nr(mm, p, curr_cpu) ||
+ exceed_llc_capacity(mm, curr_cpu)) {
+ if (READ_ONCE(mm->sc_stat.cpu) != -1)
+ WRITE_ONCE(mm->sc_stat.cpu, -1);
+
+ return;
+ }
+
+ if (!zalloc_cpumask_var(&cpus, GFP_KERNEL))
+ return;
+
+ scoped_guard (cpus_read_lock) {
+ guard(rcu)();
+
+ get_scan_cpumasks(cpus, p);
+
+ for_each_cpu(cpu, cpus) {
+ /* XXX sched_cluster_active */
+ struct sched_domain *sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
+ unsigned long occ, m_occ = 0, a_occ = 0;
+ int m_cpu = -1, i;
+
+ if (!sd)
+ continue;
+
+ /* a_occ: summed occupancy of this LLC; m_cpu: its busiest CPU */
+ for_each_cpu(i, sched_domain_span(sd)) {
+ occ = fraction_mm_sched(cpu_rq(i),
+ per_cpu_ptr(mm->sc_stat.pcpu_sched, i));
+ a_occ += occ;
+ if (occ > m_occ) {
+ m_occ = occ;
+ m_cpu = i;
+ }
+
+ cur = rcu_dereference_all(cpu_rq(i)->curr);
+ if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) &&
+ cur->mm == mm)
+ nr_running++;
+ }
+
+ /*
+ * Compare the accumulated occupancy of each LLC. The
+ * reason for using accumulated occupancy rather than average
+ * per CPU occupancy is that it works better in asymmetric LLC
+ * scenarios.
+ * For example, if there are 2 threads in a 4CPU LLC and 3
+ * threads in an 8CPU LLC, it might be better to choose the one
+ * with 3 threads. However, this would not be the case if the
+ * occupancy is divided by the number of CPUs in an LLC (i.e.,
+ * if average per CPU occupancy is used).
+ * Besides, NUMA balancing fault statistics behave similarly:
+ * the total number of faults per node is compared rather than
+ * the average number of faults per CPU. This strategy is also
+ * followed here.
+ */
+ if (a_occ > m_a_occ) {
+ m_a_occ = a_occ;
+ m_a_cpu = m_cpu;
+ }
+
+ if (llc_id(cpu) == llc_id(READ_ONCE(mm->sc_stat.cpu)))
+ curr_m_a_occ = a_occ;
+
+ /* this LLC is done; skip its remaining CPUs in the outer loop */
+ cpumask_andnot(cpus, cpus, sched_domain_span(sd));
+ }
+ }
+
+ if (m_a_occ > (2 * curr_m_a_occ)) {
+ /*
+ * Avoid switching sc_stat.cpu too fast.
+ * The reason to choose 2X is because:
+ * 1. It is better to keep the preferred LLC stable,
+ * rather than changing it frequently and cause migrations
+ * 2. 2X means the new preferred LLC has at least 1 more
+ * busy CPU than the old one(200% vs 100%, eg)
+ * 3. 2X is chosen based on test results, as it delivers
+ * the optimal performance gain so far.
+ */
+ WRITE_ONCE(mm->sc_stat.cpu, m_a_cpu);
+ }
+
+ update_avg_scale(&mm->sc_stat.nr_running_avg, nr_running);
+ free_cpumask_var(cpus);
+}
+
+/*
+ * Prepare the cache aware scheduling state of a new task @p: install
+ * task_cache_work() as its scan callback, mark the work as not queued and
+ * clear the LLC preference.
+ */
+void init_sched_mm(struct task_struct *p)
+{
+	init_task_work(&p->cache_work, task_cache_work);
+	/* ->next pointing at itself means "not queued" */
+	p->cache_work.next = &p->cache_work;
+
+	/*
+	 * A new task starts without a preferred LLC so that it does
+	 * not pollute account_llc_enqueue().
+	 */
+	p->preferred_llc = -1;
+}
+
+#else /* CONFIG_SCHED_CACHE */
+
+/*
+ * No-op stand-ins used when CONFIG_SCHED_CACHE is disabled: nothing is
+ * accounted and no task ever gets a preferred LLC (get_pref_llc() is -1).
+ */
+static inline void account_mm_sched(struct rq *rq, struct task_struct *p,
+ s64 delta_exec) { }
+
+void init_sched_mm(struct task_struct *p) { }
+
+static void task_tick_cache(struct rq *rq, struct task_struct *p) { }
+
+static inline int get_pref_llc(struct task_struct *p,
+ struct mm_struct *mm)
+{
+ return -1;
+}
+
+static void account_llc_enqueue(struct rq *rq, struct task_struct *p) {}
+
+static void account_llc_dequeue(struct rq *rq, struct task_struct *p) {}
+
+#endif /* CONFIG_SCHED_CACHE */
+
/*
* Used by other classes to account runtime.
*/
@@ -3038,6 +3616,7 @@ static void task_numa_placement(struct task_struct *p)
unsigned long total_faults;
u64 runtime, period;
spinlock_t *group_lock = NULL;
+ long __maybe_unused new_fp;
struct numa_group *ng;
/*
@@ -3112,6 +3691,31 @@ static void task_numa_placement(struct task_struct *p)
ng->total_faults += diff;
group_faults += ng->faults[mem_idx];
}
+#ifdef CONFIG_SCHED_CACHE
+ /*
+ * Per task p->numa_faults[mem_idx] converges,
+ * so the accumulation of each task's faults
+ * converges too - Given the number of threads,
+ * it cannot overflow an unsigned long.
+ * Racy with concurrent updates from other threads
+ * sharing this mm. Acceptable since footprint is a
+ * heuristic and occasional lost updates are tolerable.
+ *
+ * If a task exits, its corresponding footprint must
+ * be subtracted from the mm->sc_stat.footprint, otherwise
+ * the mm->sc_stat.footprint will not converge:
+ * the exiting thread's footprint remains unchanged/undecayed
+ * in mm->sc_stat.footprint. See exit_mm().
+ *
+ * Lost updates and unsynchronized subtraction
+ * in exit_mm() can cause footprint + diff to
+ * go negative. Clamp to zero to prevent the
+ * unsigned footprint from wrapping.
+ */
+ new_fp = (long)READ_ONCE(p->mm->sc_stat.footprint) + diff;
+ WRITE_ONCE(p->mm->sc_stat.footprint,
+ max(new_fp, 0L));
+#endif
}
if (!ng) {
@@ -3836,9 +4440,11 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
if (entity_is_task(se)) {
+ struct task_struct *p = task_of(se);
struct rq *rq = rq_of(cfs_rq);
- account_numa_enqueue(rq, task_of(se));
+ account_numa_enqueue(rq, p);
+ account_llc_enqueue(rq, p);
list_add(&se->group_node, &rq->cfs_tasks);
}
cfs_rq->nr_queued++;
@@ -3849,7 +4455,11 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
if (entity_is_task(se)) {
- account_numa_dequeue(rq_of(cfs_rq), task_of(se));
+ struct task_struct *p = task_of(se);
+ struct rq *rq = rq_of(cfs_rq);
+
+ account_numa_dequeue(rq, p);
+ account_llc_dequeue(rq, p);
list_del_init(&se->group_node);
}
cfs_rq->nr_queued--;
@@ -9617,6 +10227,16 @@ enum group_type {
*/
group_imbalanced,
/*
+ * There are tasks running on non-preferred LLC, possible to move
+ * them to their preferred LLC without creating too much imbalance.
+ * The priority of group_llc_balance is lower than that of
+ * group_overloaded and higher than that of all other group types.
+ * This is because group_llc_balance may exacerbate load imbalance.
+ * If the LLC balancing attempt fails, the nr_balance_failed
+ * mechanism will trigger other group types to rebalance the load.
+ */
+ group_llc_balance,
+ /*
* The CPU is overloaded and can't provide expected CPU cycles to all
* tasks.
*/
@@ -9627,7 +10247,8 @@ enum migration_type {
migrate_load = 0,
migrate_util,
migrate_task,
- migrate_misfit
+ migrate_misfit,
+ migrate_llc_task
};
#define LBF_ALL_PINNED 0x01
@@ -9635,6 +10256,7 @@ enum migration_type {
#define LBF_DST_PINNED 0x04
#define LBF_SOME_PINNED 0x08
#define LBF_ACTIVE_LB 0x10
+#define LBF_LLC_PINNED 0x20
struct lb_env {
struct sched_domain *sd;
@@ -9792,6 +10414,298 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_
return 0;
}
+#ifdef CONFIG_SCHED_CACHE
+/*
+ * The margin used when comparing LLC utilization with CPU capacity.
+ * It determines the LLC load level where active LLC aggregation is
+ * done.
+ * Derived from fits_capacity().
+ *
+ * (default: ~50%, tunable via debugfs)
+ */
+static bool fits_llc_capacity(unsigned long util, unsigned long max)
+{
+ u32 aggr_pct = llc_overaggr_pct;
+
+ /*
+ * For non-SMT systems (one thread per core), raise the
+ * aggregation threshold to accommodate more tasks.
+ */
+ if (cpu_smt_num_threads == 1)
+ aggr_pct = (aggr_pct * 3 / 2);
+
+ return util * 100 < max * aggr_pct;
+}
+
+/*
+ * The margin used when comparing utilization.
+ * Is 'util1' noticeably greater than 'util2'?
+ * Derived from capacity_greater().
+ * The bias (llc_imb_pct) is a percentage.
+ */
+/* Allows dst util to be bigger than src util by up to bias percent */
+#define util_greater(util1, util2) \
+ ((util1) * 100 > (util2) * (100 + llc_imb_pct))
+
+static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util,
+ unsigned long *cap)
+{
+ struct sched_domain_shared *sd_share;
+
+ sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
+ if (!sd_share)
+ return false;
+
+ *util = READ_ONCE(sd_share->util_avg);
+ *cap = READ_ONCE(sd_share->capacity);
+
+ return true;
+}
+
+/*
+ * Decision matrix according to the LLC utilization. To
+ * decide whether we can do task aggregation across LLC.
+ *
+ * By default, 50% is the threshold for treating the LLC
+ * as busy. The reason for choosing 50% is to avoid saturation
+ * of SMT-2, and it is also a safe cutoff for other SMT-n
+ * platforms. SMT-1 has higher threshold because it is
+ * supposed to accommodate more tasks, see fits_llc_capacity().
+ *
+ * 20% is the utilization imbalance percentage to decide
+ * if the preferred LLC is busier than the non-preferred LLC.
+ * 20 is a little higher than the LLC domain's imbalance_pct
+ * 17. The hysteresis is used to avoid task bouncing between the
+ * preferred LLC and the non-preferred LLC, and it will
+ * be turned into a debugfs tunable.
+ *
+ * 1. moving towards the preferred LLC, dst is the preferred
+ * LLC, src is not.
+ *
+ * src \ dst 30% 40% 50% 60%
+ * 30% Y Y Y N
+ * 40% Y Y Y Y
+ * 50% Y Y G G
+ * 60% Y Y G G
+ *
+ * 2. moving out of the preferred LLC, src is the preferred
+ * LLC, dst is not:
+ *
+ * src \ dst 30% 40% 50% 60%
+ * 30% N N N N
+ * 40% N N N N
+ * 50% N N G G
+ * 60% Y N G G
+ *
+ * src : src_util
+ * dst : dst_util
+ * Y : Yes, migrate
+ * N : No, do not migrate
+ * G : let the Generic load balance to even the load.
+ *
+ * The intention is that if both LLCs are quite busy, cache aware
+ * load balance should not be performed, and generic load balance
+ * should take effect. However, if one is busy and the other is not,
+ * the preferred LLC capacity(50%) and imbalance criteria(20%) should
+ * be considered to determine whether LLC aggregation should be
+ * performed to bias the load towards the preferred LLC.
+ */
+
+/* migration decision, 3 states are orthogonal. */
+enum llc_mig {
+ mig_forbid = 0, /* N: Don't migrate task, respect LLC preference */
+ mig_llc, /* Y: Do LLC preference based migration */
+ mig_unrestricted /* G: Don't restrict generic load balance migration */
+};
+
+/*
+ * Check if task can be moved from the source LLC to the
+ * destination LLC without breaking cache aware preference.
+ * src_cpu and dst_cpu are arbitrary CPUs within the source
+ * and destination LLCs, respectively.
+ */
+static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu,
+ unsigned long tsk_util,
+ bool to_pref)
+{
+ unsigned long src_util, dst_util, src_cap, dst_cap;
+
+ if (!get_llc_stats(src_cpu, &src_util, &src_cap) ||
+ !get_llc_stats(dst_cpu, &dst_util, &dst_cap))
+ return mig_unrestricted;
+
+ src_util = src_util < tsk_util ? 0 : src_util - tsk_util;
+ dst_util = dst_util + tsk_util;
+
+ if (!fits_llc_capacity(dst_util, dst_cap) &&
+ !fits_llc_capacity(src_util, src_cap))
+ return mig_unrestricted;
+
+ if (to_pref) {
+ /*
+ * Don't migrate if we will get preferred LLC too
+ * heavily loaded and if the dest is much busier
+ * than the src, in which case migration will
+ * increase the imbalance too much.
+ */
+ if (!fits_llc_capacity(dst_util, dst_cap) &&
+ util_greater(dst_util, src_util))
+ return mig_forbid;
+ } else {
+ /*
+ * Don't migrate if we will leave the preferred LLC
+ * too idle, or if this migration would bring the
+ * non-preferred LLC within llc_imb_pct percent
+ * of the preferred LLC, leading to migration
+ * back to the preferred LLC again.
+ */
+ if (fits_llc_capacity(src_util, src_cap) ||
+ !util_greater(src_util, dst_util))
+ return mig_forbid;
+ }
+ return mig_llc;
+}
+
+/*
+ * Check if task p can migrate from source LLC to
+ * destination LLC in terms of cache aware load balance.
+ */
+static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu,
+ struct task_struct *p)
+{
+ struct mm_struct *mm;
+ bool to_pref;
+ int cpu;
+
+ mm = p->mm;
+ if (!mm)
+ return mig_unrestricted;
+
+ cpu = READ_ONCE(mm->sc_stat.cpu);
+ if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu))
+ return mig_unrestricted;
+
+ /* skip cache aware load balance for too many threads */
+ if (invalid_llc_nr(mm, p, dst_cpu) ||
+ exceed_llc_capacity(mm, dst_cpu)) {
+ if (READ_ONCE(mm->sc_stat.cpu) != -1)
+ WRITE_ONCE(mm->sc_stat.cpu, -1);
+ return mig_unrestricted;
+ }
+
+ if (cpus_share_cache(dst_cpu, cpu))
+ to_pref = true;
+ else if (cpus_share_cache(src_cpu, cpu))
+ to_pref = false;
+ else
+ return mig_unrestricted;
+
+ return can_migrate_llc(src_cpu, dst_cpu,
+ task_util(p), to_pref);
+}
+
+/*
+ * Check if active load balance breaks LLC locality in
+ * terms of cache aware load balance. The load level and
+ * imbalance do not warrant breaking LLC preference per
+ * the can_migrate_llc() policy. Here, the benefit of
+ * LLC locality outweighs the power efficiency gained from
+ * migrating the only runnable task away.
+ */
+static inline bool
+alb_break_llc(struct lb_env *env)
+{
+ if (!sched_cache_enabled())
+ return false;
+
+ if (cpus_share_cache(env->src_cpu, env->dst_cpu))
+ return false;
+ /*
+ * All tasks prefer to stay on their current CPU.
+ * Do not pull a task from its preferred CPU if:
+ * 1. It is the only task running and does not exceed
+ * imbalance allowance; OR
+ * 2. Migrating it away from its preferred LLC would violate
+ * the cache-aware scheduling policy.
+ */
+ if (env->src_rq->nr_pref_llc_running &&
+ env->src_rq->nr_pref_llc_running == env->src_rq->cfs.h_nr_runnable) {
+ unsigned long util = 0;
+ struct task_struct *cur;
+
+ if (env->src_rq->nr_running <= 1)
+ return true;
+
+ cur = rcu_dereference_all(env->src_rq->curr);
+ if (cur && cur->sched_class == &fair_sched_class)
+ util = task_util(cur);
+
+ if (can_migrate_llc(env->src_cpu, env->dst_cpu,
+ util, false) == mig_forbid)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Check if migrating task p from env->src_cpu to
+ * env->dst_cpu breaks LLC locality.
+ */
+static bool migrate_degrades_llc(struct task_struct *p, struct lb_env *env)
+{
+ if (!sched_cache_enabled())
+ return false;
+
+ if (task_has_sched_core(p))
+ return false;
+ /*
+ * Skip over tasks that would degrade LLC locality;
+ * only when nr_balance_failed is sufficiently high do we
+ * ignore this constraint.
+ *
+ * Threshold of cache_nice_tries is set to 1 higher
+ * than nr_balance_failed to avoid excessive task
+ * migration at the same time.
+ */
+ if (env->sd->nr_balance_failed >= env->sd->cache_nice_tries + 1)
+ return false;
+
+ /*
+ * We know env->src_cpu has some tasks that prefer to
+ * run on env->dst_cpu's LLC; skip the tasks that do not
+ * prefer that LLC, and find one that does.
+ */
+ if (env->migration_type == migrate_llc_task &&
+ READ_ONCE(p->preferred_llc) != llc_id(env->dst_cpu))
+ return true;
+
+ if (can_migrate_llc_task(env->src_cpu,
+ env->dst_cpu, p) != mig_forbid)
+ return false;
+
+ return true;
+}
+
+#else
+static inline bool get_llc_stats(int cpu, unsigned long *util,
+ unsigned long *cap)
+{
+ return false;
+}
+
+static inline bool
+alb_break_llc(struct lb_env *env)
+{
+ return false;
+}
+
+static inline bool
+migrate_degrades_llc(struct task_struct *p, struct lb_env *env)
+{
+ return false;
+}
+#endif
/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*/
@@ -9888,10 +10802,29 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
return 1;
degrades = migrate_degrades_locality(p, env);
- if (!degrades)
+ if (!degrades) {
+ /*
+ * If the NUMA locality is not broken,
+ * further check if migration would hurt
+ * LLC locality.
+ */
+ if (migrate_degrades_llc(p, env)) {
+ /*
+ * If regular load balancing fails to pull a task
+ * due to LLC locality, this is expected behavior
+ * and we set LBF_LLC_PINNED so we don't increase
+ * nr_balance_failed unnecessarily.
+ */
+ if (env->migration_type != migrate_llc_task)
+ env->flags |= LBF_LLC_PINNED;
+
+ return 0;
+ }
+
hot = task_hot(p, env);
- else
+ } else {
hot = degrades > 0;
+ }
if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
if (hot)
@@ -10053,6 +10986,10 @@ static int detach_tasks(struct lb_env *env)
env->imbalance = 0;
break;
+
+ case migrate_llc_task:
+ env->imbalance--;
+ break;
}
detach_task(p, env);
@@ -10331,12 +11268,16 @@ struct sg_lb_stats {
enum group_type group_type;
unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
unsigned int group_smt_balance; /* Task on busy SMT be moved */
+ unsigned int group_llc_balance; /* Tasks should be moved to preferred LLC */
unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
unsigned int group_overutilized; /* At least one CPU is overutilized in the group */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
#endif
+#ifdef CONFIG_SCHED_CACHE
+ unsigned int nr_pref_dst_llc;
+#endif
};
/*
@@ -10594,6 +11535,9 @@ group_type group_classify(unsigned int imbalance_pct,
if (group_is_overloaded(imbalance_pct, sgs))
return group_overloaded;
+ if (sgs->group_llc_balance)
+ return group_llc_balance;
+
if (sg_imbalanced(group))
return group_imbalanced;
@@ -10748,6 +11692,105 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
return check_cpu_capacity(rq, sd);
}
+#ifdef CONFIG_SCHED_CACHE
+/*
+ * Record the statistics for this scheduler group for later
+ * use. These values guide load balancing on aggregating tasks
+ * to a LLC.
+ */
+static void record_sg_llc_stats(struct lb_env *env,
+ struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+ struct sched_domain_shared *sd_share;
+ int cpu;
+
+ if (!sched_cache_enabled() || env->idle == CPU_NEWLY_IDLE)
+ return;
+
+ /* Only care about sched domain spanning multiple LLCs */
+ if (env->sd->child != rcu_dereference_all(per_cpu(sd_llc, env->dst_cpu)))
+ return;
+
+ /*
+ * At this point we know this group spans a LLC domain.
+ * Record the statistic of this group in its corresponding
+ * shared LLC domain.
+ * Note: sd_share cannot be obtained via sd->child->shared,
+ * because the latter refers to the domain that covers the
+ * local group. Instead, sd_share should be located using
+ * the first CPU of the LLC group.
+ */
+ cpu = cpumask_first(sched_group_span(group));
+ sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
+ if (!sd_share)
+ return;
+
+ if (READ_ONCE(sd_share->util_avg) != sgs->group_util)
+ WRITE_ONCE(sd_share->util_avg, sgs->group_util);
+
+ if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity))
+ WRITE_ONCE(sd_share->capacity, sgs->group_capacity);
+}
+
+/*
+ * Do LLC balance on sched group that contains LLC, and have tasks preferring
+ * to run on LLC in idle dst_cpu.
+ */
+static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+ if (!sched_cache_enabled())
+ return false;
+
+ if (env->sd->flags & SD_SHARE_LLC)
+ return false;
+
+ /*
+ * Skip cache aware tagging if nr_balance_failed is sufficiently high.
+ * Threshold of cache_nice_tries is set to 1 higher than nr_balance_failed
+ * to avoid excessive task migration at the same time.
+ */
+ if (env->sd->nr_balance_failed >= env->sd->cache_nice_tries + 1)
+ return false;
+
+ if (sgs->nr_pref_dst_llc &&
+ can_migrate_llc(cpumask_first(sched_group_span(group)),
+ env->dst_cpu, 0, true) == mig_llc)
+ return true;
+
+ return false;
+}
+
+static bool update_llc_busiest(struct lb_env *env,
+ struct sg_lb_stats *busiest,
+ struct sg_lb_stats *sgs)
+{
+ /*
+ * There are more tasks that want to run on dst_cpu's LLC.
+ */
+ return sgs->nr_pref_dst_llc > busiest->nr_pref_dst_llc;
+}
+#else
+static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+}
+
+static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+ return false;
+}
+
+static bool update_llc_busiest(struct lb_env *env,
+ struct sg_lb_stats *busiest,
+ struct sg_lb_stats *sgs)
+{
+ return false;
+}
+#endif
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -10784,6 +11827,20 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (cpu_overutilized(i))
sgs->group_overutilized = 1;
+#ifdef CONFIG_SCHED_CACHE
+ if (sched_cache_enabled()) {
+ struct sched_domain *sd_tmp;
+ int dst_llc;
+
+ dst_llc = llc_id(env->dst_cpu);
+ if (llc_id(i) != dst_llc) {
+ sd_tmp = rcu_dereference_all(rq->sd);
+ if (sd_tmp && (unsigned int)dst_llc < sd_tmp->llc_max)
+ sgs->nr_pref_dst_llc += sd_tmp->llc_counts[dst_llc];
+ }
+ }
+#endif
+
/*
* No need to call idle_cpu() if nr_running is not 0
*/
@@ -10824,17 +11881,24 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_weight = group->group_weight;
- /* Check if dst CPU is idle and preferred to this group */
- if (!local_group && env->idle && sgs->sum_h_nr_running &&
- sched_group_asym(env, sgs, group))
- sgs->group_asym_packing = 1;
+ if (!local_group) {
+ /* Check if dst CPU is idle and preferred to this group */
+ if (env->idle && sgs->sum_h_nr_running &&
+ sched_group_asym(env, sgs, group))
+ sgs->group_asym_packing = 1;
+
+ /* Check for loaded SMT group to be balanced to dst CPU */
+ if (smt_balance(env, sgs, group))
+ sgs->group_smt_balance = 1;
- /* Check for loaded SMT group to be balanced to dst CPU */
- if (!local_group && smt_balance(env, sgs, group))
- sgs->group_smt_balance = 1;
+ /* Check whether tasks in this group can be moved to their preferred LLC */
+ if (llc_balance(env, sgs, group))
+ sgs->group_llc_balance = 1;
+ }
sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
+ record_sg_llc_stats(env, sgs, group);
/* Computing avg_load makes sense only when group is overloaded */
if (sgs->group_type == group_overloaded)
sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
@@ -10899,6 +11963,10 @@ static bool update_sd_pick_busiest(struct lb_env *env,
/* Select the overloaded group with highest avg_load. */
return sgs->avg_load > busiest->avg_load;
+ case group_llc_balance:
+ /* Select the group with most tasks preferring dst LLC */
+ return update_llc_busiest(env, busiest, sgs);
+
case group_imbalanced:
/*
* Select the 1st imbalanced group as we don't have any way to
@@ -11161,6 +12229,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
return false;
break;
+ case group_llc_balance:
case group_imbalanced:
case group_asym_packing:
case group_smt_balance:
@@ -11293,6 +12362,7 @@ sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int
return NULL;
break;
+ case group_llc_balance:
case group_imbalanced:
case group_asym_packing:
case group_smt_balance:
@@ -11547,6 +12617,15 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
return;
}
+#ifdef CONFIG_SCHED_CACHE
+ if (busiest->group_type == group_llc_balance) {
+ /* Move a task that prefers the local LLC */
+ env->migration_type = migrate_llc_task;
+ env->imbalance = 1;
+ return;
+ }
+#endif
+
if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
@@ -11793,7 +12872,8 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
* group's child domain.
*/
if (sds.prefer_sibling && local->group_type == group_has_spare &&
- sibling_imbalance(env, &sds, busiest, local) > 1)
+ (busiest->group_type == group_llc_balance ||
+ sibling_imbalance(env, &sds, busiest, local) > 1))
goto force_balance;
if (busiest->group_type != group_overloaded) {
@@ -11852,7 +12932,10 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env,
{
struct rq *busiest = NULL, *rq;
unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
+ unsigned int __maybe_unused busiest_pref_llc = 0;
+ struct sched_domain __maybe_unused *sd_tmp;
unsigned int busiest_nr = 0;
+ int __maybe_unused dst_llc;
int i;
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
@@ -11980,6 +13063,23 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env,
break;
+ case migrate_llc_task:
+#ifdef CONFIG_SCHED_CACHE
+ sd_tmp = rcu_dereference_all(rq->sd);
+ dst_llc = llc_id(env->dst_cpu);
+
+ if (sd_tmp && (unsigned)dst_llc < sd_tmp->llc_max) {
+ unsigned int this_pref_llc =
+ sd_tmp->llc_counts[dst_llc];
+
+ if (busiest_pref_llc < this_pref_llc) {
+ busiest_pref_llc = this_pref_llc;
+ busiest = rq;
+ }
+ }
+#endif
+ break;
+
}
}
@@ -12031,6 +13131,9 @@ static int need_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
+ if (alb_break_llc(env))
+ return 0;
+
if (asym_active_balance(env))
return 1;
@@ -12050,7 +13153,8 @@ static int need_active_balance(struct lb_env *env)
return 1;
}
- if (env->migration_type == migrate_misfit)
+ if (env->migration_type == migrate_misfit ||
+ env->migration_type == migrate_llc_task)
return 1;
return 0;
@@ -12143,6 +13247,8 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd
case migrate_misfit:
__schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance);
break;
+ case migrate_llc_task:
+ break;
}
}
@@ -12346,9 +13452,16 @@ more_balance:
*
* Similarly for migration_misfit which is not related to
* load/util migration, don't pollute nr_balance_failed.
+ *
+ * The same for cache aware scheduling's allowance for
+ * load imbalance. If regular load balance does not
+ * migrate a task due to LLC locality, it is expected
+ * behavior, so don't pollute nr_balance_failed.
+ * See can_migrate_task().
*/
if (idle != CPU_NEWLY_IDLE &&
- env.migration_type != migrate_misfit)
+ env.migration_type != migrate_misfit &&
+ !(env.flags & LBF_LLC_PINNED))
sd->nr_balance_failed++;
if (need_active_balance(&env)) {
@@ -13756,6 +14869,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
+ task_tick_cache(rq, curr);
+
update_misfit_status(curr, rq);
check_update_overutilized_status(task_rq(curr));
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bfb4b47c021b..8eb8f83db6b0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1187,6 +1187,12 @@ struct rq {
struct scx_rq scx;
struct sched_dl_entity ext_server;
#endif
+#ifdef CONFIG_SCHED_CACHE
+ raw_spinlock_t cpu_epoch_lock ____cacheline_aligned;
+ u64 cpu_runtime;
+ unsigned long cpu_epoch;
+ unsigned long cpu_epoch_next;
+#endif
struct sched_dl_entity fair_server;
@@ -1199,6 +1205,12 @@ struct rq {
#ifdef CONFIG_NUMA_BALANCING
unsigned int numa_migrate_on;
#endif
+
+#ifdef CONFIG_SCHED_CACHE
+ unsigned int nr_pref_llc_running;
+ unsigned int nr_llc_running;
+#endif
+
/*
* This is part of a global counter where only the total sum
* over all CPUs matters. A task can increase this counter on
@@ -1546,6 +1558,14 @@ extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
extern void sched_core_get(void);
extern void sched_core_put(void);
+static inline bool task_has_sched_core(struct task_struct *p)
+{
+ if (sched_core_disabled())
+ return false;
+
+ return !!p->core_cookie;
+}
+
#else /* !CONFIG_SCHED_CORE: */
static inline bool sched_core_enabled(struct rq *rq)
@@ -1586,6 +1606,11 @@ static inline bool sched_group_cookie_match(struct rq *rq,
return true;
}
+static inline bool task_has_sched_core(struct task_struct *p)
+{
+ return false;
+}
+
#endif /* !CONFIG_SCHED_CORE */
#ifdef CONFIG_RT_GROUP_SCHED
@@ -2076,6 +2101,8 @@ init_numa_balancing(u64 clone_flags, struct task_struct *p)
#endif /* !CONFIG_NUMA_BALANCING */
+int task_llc(const struct task_struct *p);
+
static inline void
queue_balance_callback(struct rq *rq,
struct balance_callback *head,
@@ -2164,6 +2191,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(int, sd_share_id);
+DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
@@ -4031,6 +4059,29 @@ static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct
static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { }
#endif /* !CONFIG_SCHED_MM_CID */
+#ifdef CONFIG_SCHED_CACHE
+DECLARE_STATIC_KEY_FALSE(sched_cache_present);
+DECLARE_STATIC_KEY_FALSE(sched_cache_active);
+extern int sysctl_sched_cache_user;
+extern unsigned int llc_aggr_tolerance;
+extern unsigned int llc_epoch_period;
+extern unsigned int llc_epoch_affinity_timeout;
+extern unsigned int llc_imb_pct;
+extern unsigned int llc_overaggr_pct;
+
+static inline bool sched_cache_enabled(void)
+{
+ return static_branch_unlikely(&sched_cache_active);
+}
+
+extern void sched_cache_active_set(void);
+
+#endif
+
+void sched_domains_free_llc_id(int cpu);
+
+extern void init_sched_mm(struct task_struct *p);
+
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
static inline
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f96d50131495..dbfd9657f897 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -19,8 +19,10 @@ void sched_domains_mutex_unlock(void)
}
/* Protected by sched_domains_mutex: */
+static cpumask_var_t sched_domains_llc_id_allocmask;
static cpumask_var_t sched_domains_tmpmask;
static cpumask_var_t sched_domains_tmpmask2;
+int max_lid;
static int __init sched_debug_setup(char *str)
{
@@ -632,6 +634,11 @@ static void destroy_sched_domain(struct sched_domain *sd)
if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
kfree(sd->shared);
+
+#ifdef CONFIG_SCHED_CACHE
+ /* only the bottom sd has llc_counts array */
+ kfree(sd->llc_counts);
+#endif
kfree(sd);
}
@@ -663,8 +670,9 @@ static void destroy_sched_domains(struct sched_domain *sd)
*/
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
-DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(int, sd_llc_id) = -1;
DEFINE_PER_CPU(int, sd_share_id);
+DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
@@ -680,38 +688,19 @@ static void update_top_cache_domain(int cpu)
int id = cpu;
int size = 1;
- sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
- /*
- * The shared object is attached to sd_asym_cpucapacity only when the
- * asym domain is non-overlapping (i.e., not built from SD_NUMA).
- * On overlapping (NUMA) asym domains we fall back to letting the
- * SD_SHARE_LLC path own the shared object, so sd->shared may be NULL
- * here.
- */
- if (sd && sd->shared)
- sds = sd->shared;
-
- rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
-
sd = highest_flag_domain(cpu, SD_SHARE_LLC);
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
- /*
- * If sd_asym_cpucapacity didn't claim the shared object,
- * sd_llc must have one linked.
- */
- if (!sds) {
- WARN_ON_ONCE(!sd->shared);
- sds = sd->shared;
- }
+ /* If sd_llc exists, sd_llc_shared should exist too. */
+ WARN_ON_ONCE(!sd->shared);
+ sds = sd->shared;
}
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
- per_cpu(sd_llc_id, cpu) = id;
- rcu_assign_pointer(per_cpu(sd_balance_shared, cpu), sds);
+ rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
sd = lowest_flag_domain(cpu, SD_CLUSTER);
if (sd)
@@ -729,6 +718,20 @@ static void update_top_cache_domain(int cpu)
sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
+
+ sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
+ /*
+ * The shared object is attached to sd_asym_cpucapacity only when the
+ * asym domain is non-overlapping (i.e., not built from SD_NUMA).
+ * On overlapping (NUMA) asym domains we fall back to letting the
+ * SD_SHARE_LLC path own the shared object, so sd->shared may be NULL
+ * here.
+ */
+ if (sd && sd->shared)
+ sds = sd->shared;
+
+ rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
+ rcu_assign_pointer(per_cpu(sd_balance_shared, cpu), sds);
}
/*
@@ -777,10 +780,20 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
if (sd && sd_degenerate(sd)) {
tmp = sd;
sd = sd->parent;
- destroy_sched_domain(tmp);
+
if (sd) {
struct sched_group *sg = sd->groups;
+#ifdef CONFIG_SCHED_CACHE
+ /* move buffer to parent as child is being destroyed */
+ sd->llc_counts = tmp->llc_counts;
+ sd->llc_max = tmp->llc_max;
+ sd->llc_bytes = tmp->llc_bytes;
+ /* make sure destroy_sched_domain() does not free it */
+ tmp->llc_counts = NULL;
+ tmp->llc_max = 0;
+ tmp->llc_bytes = 0;
+#endif
/*
* sched groups hold the flags of the child sched
* domain for convenience. Clear such flags since
@@ -792,6 +805,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
sd->child = NULL;
}
+
+ destroy_sched_domain(tmp);
}
sched_domain_debug(sd, cpu);
@@ -819,6 +834,239 @@ enum s_alloc {
sa_none,
};
+#ifdef CONFIG_SCHED_CACHE
+/*
+ * Hardware support for cache aware scheduling. Toggled by
+ * sched_cache_set() according to whether a partition spanning
+ * multiple LLCs was found.
+ */
+DEFINE_STATIC_KEY_FALSE(sched_cache_present);
+/*
+ * Indicator of whether cache aware scheduling
+ * is active, used by the scheduler. Only enabled by
+ * _sched_cache_active_set() when sched_cache_present is set
+ * and sysctl_sched_cache_user is non-zero.
+ */
+DEFINE_STATIC_KEY_FALSE(sched_cache_active);
+/* user wants cache aware scheduling [0 or 1], enabled by default */
+int sysctl_sched_cache_user = 1;
+
+/*
+ * Get the effective LLC size in bytes that @cpu's bottom sched_domain
+ * can use. A CPU within a cpuset partition can only use a proportion
+ * of the physical LLC, scaled by the ratio of the partition's span
+ * weight to the hardware LLC sharing weight. @sd should be the
+ * topmost domain with SD_SHARE_LLC.
+ *
+ * Returns 0 if cacheinfo is not yet populated. This happens during
+ * early boot when build_sched_domains() runs before the generic
+ * cacheinfo framework has been initialized (cacheinfo_cpu_online()
+ * is a device_initcall cpuhp callback). In that case,
+ * cacheinfo_cpu_online() will later call sched_update_llc_bytes()
+ * to fill in the bottom domain's llc_bytes once the cache attributes
+ * are available.
+ */
+static unsigned long get_effective_llc_bytes(int cpu,
+					     struct sched_domain *sd)
+{
+	struct cacheinfo *llc = get_cpu_cacheinfo_llc(cpu);
+	unsigned int nr_sharers;
+	u64 scaled;
+
+	/* No LLC cacheinfo available for this CPU: report 0. */
+	if (!llc)
+		return 0;
+
+	/* An empty sharing map would make the ratio below undefined. */
+	nr_sharers = cpumask_weight(&llc->shared_cpu_map);
+	if (nr_sharers == 0)
+		return 0;
+
+	/*
+	 * size * span_weight / nr_sharers; multiply first, in 64 bit,
+	 * so the intermediate product is not truncated.
+	 */
+	scaled = (u64)llc->size * sd->span_weight;
+
+	return div_u64(scaled, nr_sharers);
+}
+
+/*
+ * Allocate the per-LLC counter array for the bottom sched_domain of
+ * every CPU in @cpu_map whose hierarchy contains an SD_SHARE_LLC
+ * domain, and record the array length (llc_max) and the effective
+ * LLC size (llc_bytes) alongside it.
+ *
+ * CPUs without any SD_SHARE_LLC domain are left untouched and no
+ * memory is allocated on their behalf.
+ *
+ * Returns true on success. If a CPU has no bottom domain or an
+ * allocation fails, the arrays of all CPUs in @cpu_map are freed,
+ * their llc_* fields are reset and false is returned.
+ */
+static bool alloc_sd_llc(const struct cpumask *cpu_map,
+			 struct s_data *d)
+{
+	struct sched_domain *sd, *top_llc, *parent;
+	unsigned int *p;
+	int i;
+
+	for_each_cpu(i, cpu_map) {
+		sd = *per_cpu_ptr(d->sd, i);
+		if (!sd)
+			goto err;
+
+		top_llc = sd;
+		/*
+		 * Find the topmost SD_SHARE_LLC domain.
+		 * Not yet attached to the CPU, so per_cpu(sd_llc, i)
+		 * cannot be used.
+		 */
+		while ((parent = rcu_dereference_protected(top_llc->parent, true)) &&
+		       (parent->flags & SD_SHARE_LLC))
+			top_llc = parent;
+
+		/*
+		 * No LLC domain for this CPU: skip it before allocating,
+		 * so that an allocation failure cannot fail the whole
+		 * setup on behalf of a CPU that needs no array.
+		 */
+		if (!(top_llc->flags & SD_SHARE_LLC))
+			continue;
+
+		p = kcalloc_node(max_lid + 1, sizeof(*p),
+				 GFP_KERNEL, cpu_to_node(i));
+		if (!p)
+			goto err;
+
+		sd->llc_max = max_lid + 1;
+		sd->llc_counts = p;
+		sd->llc_bytes = get_effective_llc_bytes(i, top_llc);
+	}
+
+	return true;
+err:
+	/* Undo everything set up so far; kfree(NULL) is a no-op. */
+	for_each_cpu(i, cpu_map) {
+		sd = *per_cpu_ptr(d->sd, i);
+		if (sd) {
+			kfree(sd->llc_counts);
+			sd->llc_counts = NULL;
+			sd->llc_max = 0;
+			sd->llc_bytes = 0;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * Enable/disable cache aware scheduling according to
+ * user input and the presence of hardware support.
+ */
+static void _sched_cache_active_set(void)
+{
+	lockdep_assert_cpus_held();
+	lockdep_assert_held(&sched_domains_mutex);
+
+	/* Without hardware support the feature is always off. */
+	if (!static_branch_likely(&sched_cache_present)) {
+		static_branch_disable_cpuslocked(&sched_cache_active);
+		if (sched_debug())
+			pr_info("%s: cache aware scheduling not supported on this platform\n", __func__);
+		return;
+	}
+
+	/*
+	 * Hardware is capable, so follow the user's choice.
+	 * TBD: the static key is written without reading it first.
+	 * This is not in the critical path, so leave as-is for now.
+	 */
+	if (!sysctl_sched_cache_user) {
+		static_branch_disable_cpuslocked(&sched_cache_active);
+		if (sched_debug())
+			pr_info("%s: disabling cache aware scheduling\n", __func__);
+		return;
+	}
+
+	static_branch_enable_cpuslocked(&sched_cache_active);
+	if (sched_debug())
+		pr_info("%s: enabling cache aware scheduling\n", __func__);
+}
+
+/*
+ * Used by debugfs.
+ *
+ * Re-evaluate whether cache aware scheduling should be active. Takes
+ * the CPU hotplug read lock and then sched_domains_mutex, both of
+ * which _sched_cache_active_set() asserts are held, and releases
+ * them in reverse order afterwards.
+ */
+void sched_cache_active_set(void)
+{
+ cpus_read_lock();
+ sched_domains_mutex_lock();
+ _sched_cache_active_set();
+ sched_domains_mutex_unlock();
+ cpus_read_unlock();
+}
+
+/*
+ * Update the bottom sched_domain's llc_bytes for @cpu and all its
+ * LLC siblings. Called from cacheinfo_cpu_online() or
+ * cacheinfo_cpu_pre_down() with cpu hotplug lock held.
+ *
+ * Note: get_effective_llc_bytes() returns 0 on PowerPC, so
+ * cache aware scheduling is disabled on PowerPC for now.
+ * PowerPC does not use the generic cacheinfo framework --
+ * it has its own cacheinfo with a separate struct cache hierarchy
+ * and does not populate the per-CPU struct cpu_cacheinfo array
+ * that get_cpu_cacheinfo_llc() reads.
+ */
+void sched_update_llc_bytes(unsigned int cpu)
+{
+	struct sched_domain *llc_sd, *base;
+	unsigned int sibling;
+
+	sched_domains_mutex_lock();
+
+	llc_sd = rcu_dereference_sched_domain(per_cpu(sd_llc, cpu));
+	if (llc_sd) {
+		/*
+		 * ci->shared_cpu_map grows one CPU at a time as CPUs come
+		 * online. The first CPU of an LLC therefore sees
+		 * hw_weight == 1 and get_effective_llc_bytes() hands it an
+		 * inflated llc_bytes. Recomputing the value for every CPU
+		 * in the LLC span on each online event fixes that up once
+		 * the complete shared_cpu_map is known.
+		 */
+		for_each_cpu(sibling, sched_domain_span(llc_sd)) {
+			base = rcu_dereference_sched_domain(cpu_rq(sibling)->sd);
+			if (!base)
+				continue;
+
+			base->llc_bytes = get_effective_llc_bytes(sibling, llc_sd);
+		}
+	}
+
+	sched_domains_mutex_unlock();
+}
+
+/*
+ * Publish whether a multi-LLC partition exists via sched_cache_present,
+ * then recompute sched_cache_active from that and the user setting.
+ */
+static void sched_cache_set(bool has_multi_llcs)
+{
+	/*
+	 * TBD: the key is written without checking its current value.
+	 * A sched domain rebuild is not in the critical path, so leave
+	 * as-is for now.
+	 */
+	if (!has_multi_llcs)
+		static_branch_disable_cpuslocked(&sched_cache_present);
+	else
+		static_branch_enable_cpuslocked(&sched_cache_present);
+
+	_sched_cache_active_set();
+}
+#else
+/*
+ * CONFIG_SCHED_CACHE=n: there is no per-LLC state to allocate, so
+ * this always reports false.
+ */
+static inline bool alloc_sd_llc(const struct cpumask *cpu_map,
+				struct s_data *d)
+{
+	return false;
+}
+
+/* CONFIG_SCHED_CACHE=n: nothing to enable or disable. */
+static inline void sched_cache_set(bool has_multi_llcs) { }
+#endif
+
+/*
+ * Return true if @sd belongs to an LLC group whose enclosing
+ * partition spans more than one LLC. @sd must be the topmost
+ * SD_SHARE_LLC domain.
+ *
+ * Any duplicated parent domains with the same span as @sd are
+ * skipped: before cpu_attach_domain() degeneration these still
+ * exist, after degeneration the loop is a no-op. This makes the
+ * helper usable both during sched domain build and against an
+ * already-attached domain tree.
+ *
+ * Note: For systems with a single LLC per node, cache-aware
+ * scheduling is still enabled when multiple nodes exist.
+ * However, NUMA balancing decisions take precedence over
+ * cache-aware scheduling. Conversely, if there is only one
+ * LLC per partition, cache-aware scheduling should be disabled.
+ */
+static bool sd_in_multi_llcs(struct sched_domain *sd)
+{
+	struct sched_domain *up;
+
+	/* Aggregating onto a single CPU makes no sense. */
+	if (sd->span_weight == 1)
+		return false;
+
+	/*
+	 * Walk up past ancestors that merely duplicate @sd's span. The
+	 * first ancestor with a different span weight proves the
+	 * partition extends beyond this LLC.
+	 */
+	for (up = sd->parent; up; up = up->parent) {
+		if (up->span_weight != sd->span_weight)
+			return true;
+	}
+
+	return false;
+}
+
/*
* Return the canonical balance CPU for this group, this is the first CPU
* of this group that's also in the balance mask.
@@ -1803,6 +2051,11 @@ const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu
{
return cpu_coregroup_mask(cpu);
}
+
+/*
+ * Mask used to find the CPUs that may share an llc_id with @cpu.
+ * NOTE(review): presumably the CPUs sharing @cpu's LLC -- confirm that
+ * cpu_coregroup_mask() matches the LLC on all architectures.
+ */
+#define llc_mask(cpu) cpu_coregroup_mask(cpu)
+
+#else
+/* Fallback: every CPU is treated as its own group. */
+#define llc_mask(cpu) cpumask_of(cpu)
#endif
const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu)
@@ -2711,14 +2964,71 @@ static bool claim_asym_sched_domain_shared(struct s_data *d, int cpu)
return true;
}
+/*
+ * Hand out the lowest llc_id not currently marked in
+ * sched_domains_llc_id_allocmask and raise max_lid if needed.
+ * Returns the new id, or -1 when the id space is exhausted.
+ * Caller must hold sched_domains_mutex.
+ */
+static int __sched_domains_alloc_llc_id(void)
+{
+	int id, highest;
+
+	lockdep_assert_held(&sched_domains_mutex);
+
+	id = cpumask_first_zero(sched_domains_llc_id_allocmask);
+
+	/*
+	 * The llc_id space must never outgrow the number of possible
+	 * CPUs in the system.
+	 */
+	if (id >= nr_cpu_ids)
+		return -1;
+
+	__cpumask_set_cpu(id, sched_domains_llc_id_allocmask);
+
+	/* Track the largest id in use. */
+	highest = cpumask_last(sched_domains_llc_id_allocmask);
+	if (max_lid < highest)
+		max_lid = highest;
+
+	return id;
+}
+
+/*
+ * Detach @cpu from its llc_id. The id itself is returned to the
+ * allocation mask only once no other CPU in llc_mask(@cpu) still
+ * carries it. Caller must hold sched_domains_mutex.
+ */
+static void __sched_domains_free_llc_id(int cpu)
+{
+	int sibling, id, highest;
+
+	lockdep_assert_held(&sched_domains_mutex);
+
+	id = per_cpu(sd_llc_id, cpu);
+	/* Nothing assigned, or an id outside the allocatable range. */
+	if (id == -1 || id >= nr_cpu_ids)
+		return;
+
+	per_cpu(sd_llc_id, cpu) = -1;
+
+	/* Another CPU in the mask still owns the llc_id: keep it. */
+	for_each_cpu(sibling, llc_mask(cpu)) {
+		if (per_cpu(sd_llc_id, sibling) == id)
+			return;
+	}
+
+	__cpumask_clear_cpu(id, sched_domains_llc_id_allocmask);
+
+	/* Shrink max_lid to save memory. */
+	highest = cpumask_last(sched_domains_llc_id_allocmask);
+	if (highest < max_lid)
+		max_lid = highest;
+}
+
+/*
+ * Locked wrapper around __sched_domains_free_llc_id(): releases @cpu's
+ * llc_id while holding sched_domains_mutex, which the helper asserts.
+ */
+void sched_domains_free_llc_id(int cpu)
+{
+ sched_domains_mutex_lock();
+ __sched_domains_free_llc_id(cpu);
+ sched_domains_mutex_unlock();
+}
+
/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
*/
static int
-build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
+build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ bool *multi_llcs)
{
enum s_alloc alloc_state = sa_none;
+ bool has_multi_llcs = false;
struct sched_domain *sd;
struct s_data d;
struct rq *rq = NULL;
@@ -2736,6 +3046,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* Set up domains for CPUs specified by the cpu_map: */
for_each_cpu(i, cpu_map) {
struct sched_domain_topology_level *tl;
+ int lid;
sd = NULL;
for_each_sd_topology(tl) {
@@ -2749,6 +3060,29 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
break;
}
+
+ lid = per_cpu(sd_llc_id, i);
+ if (lid == -1) {
+ /* try to reuse the llc_id of its siblings */
+ for (int j = cpumask_first(llc_mask(i));
+ j < nr_cpu_ids;
+ j = cpumask_next(j, llc_mask(i))) {
+ if (i == j)
+ continue;
+
+ lid = per_cpu(sd_llc_id, j);
+
+ if (lid != -1) {
+ per_cpu(sd_llc_id, i) = lid;
+
+ break;
+ }
+ }
+
+ /* a new LLC is detected */
+ if (lid == -1)
+ per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id();
+ }
}
if (WARN_ON(!topology_span_sane(cpu_map)))
@@ -2769,33 +3103,31 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
}
for_each_cpu(i, cpu_map) {
- bool asym_claimed = false;
-
sd = *per_cpu_ptr(d.sd, i);
if (!sd)
continue;
if (has_asym)
- asym_claimed = claim_asym_sched_domain_shared(&d, i);
+ claim_asym_sched_domain_shared(&d, i);
/* First, find the topmost SD_SHARE_LLC domain */
while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
sd = sd->parent;
if (sd->flags & SD_SHARE_LLC) {
- /*
- * Initialize the sd->shared for SD_SHARE_LLC unless
- * the asym path above already claimed it.
- */
- if (!asym_claimed)
- init_sched_domain_shared(&d, sd);
+ init_sched_domain_shared(&d, sd);
/*
* In presence of higher domains, adjust the
* NUMA imbalance stats for the hierarchy.
*/
- if (IS_ENABLED(CONFIG_NUMA) && sd->parent)
- adjust_numa_imbalance(sd);
+ if (sd->parent) {
+ if (IS_ENABLED(CONFIG_NUMA))
+ adjust_numa_imbalance(sd);
+
+ if (sd_in_multi_llcs(sd))
+ has_multi_llcs = true;
+ }
}
}
@@ -2810,6 +3142,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
init_sched_groups_capacity(i, sd);
}
+ alloc_sd_llc(cpu_map, &d);
+
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
@@ -2834,6 +3168,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
ret = 0;
error:
+ *multi_llcs = has_multi_llcs;
__free_domain_allocs(&d, alloc_state, cpu_map);
return ret;
@@ -2896,8 +3231,10 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
*/
int __init sched_init_domains(const struct cpumask *cpu_map)
{
+ bool multi_llcs;
int err;
+ zalloc_cpumask_var(&sched_domains_llc_id_allocmask, GFP_KERNEL);
zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
@@ -2909,7 +3246,9 @@ int __init sched_init_domains(const struct cpumask *cpu_map)
if (!doms_cur)
doms_cur = &fallback_doms;
cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_TYPE_DOMAIN));
- err = build_sched_domains(doms_cur[0], NULL);
+ err = build_sched_domains(doms_cur[0], NULL, &multi_llcs);
+ if (!err)
+ sched_cache_set(multi_llcs);
return err;
}
@@ -2982,6 +3321,7 @@ static void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new
struct sched_domain_attr *dattr_new)
{
bool __maybe_unused has_eas = false;
+ bool has_multi_llcs = false, multi_llcs;
int i, j, n;
int new_topology;
@@ -3031,14 +3371,41 @@ match1:
for (i = 0; i < ndoms_new; i++) {
for (j = 0; j < n && !new_topology; j++) {
if (cpumask_equal(doms_new[i], doms_cur[j]) &&
- dattrs_equal(dattr_new, i, dattr_cur, j))
+ dattrs_equal(dattr_new, i, dattr_cur, j)) {
+ /*
+ * A reused partition must also be accounted for
+ * here. If reused partitions were skipped and
+ * only newly built partitions were considered,
+ * has_multi_llcs could end up wrong. For
+ * example:
+ * If the only multi-LLC partition is reused
+ * and a new single-LLC partition is built,
+ * sched_cache_set(false) would disable cache-aware
+ * scheduling globally even though the reused
+ * multi-LLC partition is still active.
+ */
+ struct sched_domain *sd;
+ int cpu = cpumask_first(doms_cur[j]);
+
+ guard(rcu)();
+ sd = rcu_dereference(cpu_rq(cpu)->sd);
+ while (sd && sd->parent && (sd->parent->flags & SD_SHARE_LLC))
+ sd = sd->parent;
+ if (sd && (sd->flags & SD_SHARE_LLC) && sd->parent &&
+ sd_in_multi_llcs(sd))
+ has_multi_llcs = true;
goto match2;
+ }
}
/* No match - add a new doms_new */
- build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
+ build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL,
+ &multi_llcs);
+ has_multi_llcs |= multi_llcs;
match2:
;
}
+ sched_cache_set(has_multi_llcs);
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
/* Build perf domains: */