summaryrefslogtreecommitdiff
path: root/kernel/time/timer_migration.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/time/timer_migration.c')
-rw-r--r--kernel/time/timer_migration.c241
1 files changed, 183 insertions, 58 deletions
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 52c15affdbff..806c23cf71fc 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -102,7 +102,7 @@
* active CPU/group information atomic_try_cmpxchg() is used instead and only
* the per CPU tmigr_cpu->lock is held.
*
- * During the setup of groups tmigr_level_list is required. It is protected by
+ * During the setup of groups, hier->level_list is required. It is protected by
* @tmigr_mutex.
*
* When @timer_base->lock as well as tmigr related locks are required, the lock
@@ -416,13 +416,12 @@
*/
static DEFINE_MUTEX(tmigr_mutex);
-static struct list_head *tmigr_level_list __read_mostly;
+
+static LIST_HEAD(tmigr_hierarchy_list);
static unsigned int tmigr_hierarchy_levels __read_mostly;
static unsigned int tmigr_crossnode_level __read_mostly;
-static struct tmigr_group *tmigr_root;
-
static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu);
/*
@@ -1469,6 +1468,34 @@ static long tmigr_trigger_active(void *unused)
return 0;
}
+static unsigned int tmigr_get_capacity(int cpu)
+{
+ /*
+ * nohz_full CPUs need to make sure there is always an available (online)
+ * and never idle migrator to handle all their global timers. That duty
+ * is served by the timekeeper which then never stops its tick. But the
+ * timekeeper must then belong to the same hierarchy as all the nohz_full
+ * CPUs. Simply turn off capacity awareness when nohz_full is running.
+ */
+ if (tick_nohz_full_enabled() || !IS_ENABLED(CONFIG_BROKEN))
+ return SCHED_CAPACITY_SCALE;
+ else
+ return arch_scale_cpu_capacity(cpu);
+}
+
+static struct tmigr_hierarchy *__tmigr_get_hierarchy(int cpu)
+{
+ unsigned int capacity = tmigr_get_capacity(cpu);
+ struct tmigr_hierarchy *iter;
+
+ list_for_each_entry(iter, &tmigr_hierarchy_list, node) {
+ if (iter->capacity == capacity)
+ return iter;
+ }
+
+ return NULL;
+}
+
static int tmigr_clear_cpu_available(unsigned int cpu)
{
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
@@ -1493,8 +1520,21 @@ static int tmigr_clear_cpu_available(unsigned int cpu)
}
if (firstexp != KTIME_MAX) {
- migrator = cpumask_any(tmigr_available_cpumask);
- work_on_cpu(migrator, tmigr_trigger_active, NULL);
+ struct tmigr_hierarchy *hier = __tmigr_get_hierarchy(cpu);
+
+ if (WARN_ON_ONCE(!hier))
+ return -EINVAL;
+
+ migrator = cpumask_any_and(tmigr_available_cpumask, hier->cpumask);
+ if (migrator < nr_cpu_ids) {
+ work_on_cpu(migrator, tmigr_trigger_active, NULL);
+ } else {
+ /*
+ * If deactivation returned an expiration, it belongs to an available
+ * nohz CPU in the hierarchy.
+ */
+ WARN_ONCE(1, "Expected available CPU in the hierarchy\n");
+ }
}
return 0;
@@ -1657,14 +1697,14 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
group->groupevt.ignore = true;
}
-static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
+static struct tmigr_group *tmigr_get_group(struct tmigr_hierarchy *hier, int node, unsigned int lvl)
{
struct tmigr_group *tmp, *group = NULL;
lockdep_assert_held(&tmigr_mutex);
/* Try to attach to an existing group first */
- list_for_each_entry(tmp, &tmigr_level_list[lvl], list) {
+ list_for_each_entry(tmp, &hier->level_list[lvl], list) {
/*
* If @lvl is below the cross NUMA node level, check whether
* this group belongs to the same NUMA node.
@@ -1698,14 +1738,14 @@ static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
tmigr_init_group(group, lvl, node);
/* Setup successful. Add it to the hierarchy */
- list_add(&group->list, &tmigr_level_list[lvl]);
+ list_add(&group->list, &hier->level_list[lvl]);
trace_tmigr_group_set(group);
return group;
}
-static bool tmigr_init_root(struct tmigr_group *group, bool activate)
+static bool tmigr_init_root(struct tmigr_hierarchy *hier, struct tmigr_group *group, bool activate)
{
- if (!group->parent && group != tmigr_root) {
+ if (!group->parent && group != hier->root) {
/*
* This is the new top-level, prepare its groupmask in advance
* to avoid accidents where yet another new top-level is
@@ -1721,11 +1761,10 @@ static bool tmigr_init_root(struct tmigr_group *group, bool activate)
}
-static void tmigr_connect_child_parent(struct tmigr_group *child,
- struct tmigr_group *parent,
- bool activate)
+static void tmigr_connect_child_parent(struct tmigr_hierarchy *hier, struct tmigr_group *child,
+ struct tmigr_group *parent, bool activate)
{
- if (tmigr_init_root(parent, activate)) {
+ if (tmigr_init_root(hier, parent, activate)) {
/*
* The previous top level had prepared its groupmask already,
* simply account it in advance as the first child. If some groups
@@ -1758,13 +1797,13 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
*/
smp_store_release(&child->parent, parent);
- trace_tmigr_connect_child_parent(child);
+ trace_tmigr_connect_child_parent(hier, child);
}
-static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
- struct tmigr_group *start, bool activate)
+static int tmigr_setup_groups(struct tmigr_hierarchy *hier, unsigned int cpu,
+ unsigned int node, struct tmigr_group *start, bool activate)
{
- struct tmigr_group *group, *child, **stack;
+ struct tmigr_group *root = hier->root, *group, *child, **stack;
int i, top = 0, err = 0, start_lvl = 0;
bool root_mismatch = false;
@@ -1777,11 +1816,11 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
start_lvl = start->level + 1;
}
- if (tmigr_root)
- root_mismatch = tmigr_root->numa_node != node;
+ if (root)
+ root_mismatch = root->numa_node != node;
for (i = start_lvl; i < tmigr_hierarchy_levels; i++) {
- group = tmigr_get_group(node, i);
+ group = tmigr_get_group(hier, node, i);
if (IS_ERR(group)) {
err = PTR_ERR(group);
i--;
@@ -1803,7 +1842,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
if (group->parent)
break;
if ((!root_mismatch || i >= tmigr_crossnode_level) &&
- list_is_singular(&tmigr_level_list[i]))
+ list_is_singular(&hier->level_list[i]))
break;
}
@@ -1831,15 +1870,15 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
tmc->tmgroup = group;
tmc->groupmask = BIT(group->num_children++);
- tmigr_init_root(group, activate);
+ tmigr_init_root(hier, group, activate);
- trace_tmigr_connect_cpu_parent(tmc);
+ trace_tmigr_connect_cpu_parent(hier, tmc);
/* There are no children that need to be connected */
continue;
} else {
child = stack[i - 1];
- tmigr_connect_child_parent(child, group, activate);
+ tmigr_connect_child_parent(hier, child, group, activate);
}
}
@@ -1895,18 +1934,23 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
data.childmask = start->groupmask;
__walk_groups_from(tmigr_active_up, &data, start, start->parent);
}
+ } else if (start) {
+ union tmigr_state state;
+
+ /* Remote activation assumes the whole target's hierarchy is inactive */
+ state.state = atomic_read(&start->migr_state);
+ WARN_ON_ONCE(state.active);
}
/* Root update */
- if (list_is_singular(&tmigr_level_list[top])) {
- group = list_first_entry(&tmigr_level_list[top],
- typeof(*group), list);
+ if (list_is_singular(&hier->level_list[top])) {
+ group = list_first_entry(&hier->level_list[top], typeof(*group), list);
WARN_ON_ONCE(group->parent);
- if (tmigr_root) {
+ if (root) {
/* Old root should be the same or below */
- WARN_ON_ONCE(tmigr_root->level > top);
+ WARN_ON_ONCE(root->level > top);
}
- tmigr_root = group;
+ hier->root = group;
}
out:
kfree(stack);
@@ -1914,34 +1958,123 @@ out:
return err;
}
+static struct tmigr_hierarchy *tmigr_get_hierarchy(int cpu)
+{
+ struct tmigr_hierarchy *hier;
+
+ hier = __tmigr_get_hierarchy(cpu);
+
+ if (hier)
+ return hier;
+
+ hier = kzalloc_flex(*hier, level_list, tmigr_hierarchy_levels);
+ if (!hier)
+ return ERR_PTR(-ENOMEM);
+
+ hier->cpumask = kzalloc(cpumask_size(), GFP_KERNEL);
+ if (!hier->cpumask) {
+ kfree(hier);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (int i = 0; i < tmigr_hierarchy_levels; i++)
+ INIT_LIST_HEAD(&hier->level_list[i]);
+
+ hier->capacity = tmigr_get_capacity(cpu);
+ list_add_tail(&hier->node, &tmigr_hierarchy_list);
+
+ return hier;
+}
+
+static int tmigr_connect_old_root(struct tmigr_hierarchy *hier, int cpu,
+ struct tmigr_group *old_root, bool activate)
+{
+ /*
+ * The target CPU must never do the prepare work, except
+ * on early boot when the boot CPU is the target. Otherwise
+ * it may spuriously activate the old top level group inside
+ * the new one (nevertheless whether old top level group is
+ * active or not) and/or release an uninitialized childmask.
+ */
+ WARN_ON_ONCE(cpu == smp_processor_id());
+ if (activate) {
+ /*
+ * The current CPU is expected to be online in the hierarchy,
+ * otherwise the old root may not be active as expected.
+ */
+ WARN_ON_ONCE(!__this_cpu_read(tmigr_cpu.available));
+ }
+
+ return tmigr_setup_groups(hier, -1, old_root->numa_node, old_root, activate);
+}
+
+static long connect_old_root_work(void *arg)
+{
+ struct tmigr_group *old_root = arg;
+ struct tmigr_hierarchy *hier;
+ int cpu = smp_processor_id();
+
+ hier = __tmigr_get_hierarchy(cpu);
+ if (WARN_ON_ONCE(!hier))
+ return -EINVAL;
+
+ return tmigr_connect_old_root(hier, cpu, old_root, true);
+}
+
static int tmigr_add_cpu(unsigned int cpu)
{
- struct tmigr_group *old_root = tmigr_root;
+ struct tmigr_hierarchy *hier;
+ struct tmigr_group *old_root;
int node = cpu_to_node(cpu);
int ret;
guard(mutex)(&tmigr_mutex);
- ret = tmigr_setup_groups(cpu, node, NULL, false);
+ hier = tmigr_get_hierarchy(cpu);
+ if (IS_ERR(hier))
+ return PTR_ERR(hier);
+
+ old_root = hier->root;
+
+ ret = tmigr_setup_groups(hier, cpu, node, NULL, false);
+
+ if (ret < 0)
+ return ret;
/* Root has changed? Connect the old one to the new */
- if (ret >= 0 && old_root && old_root != tmigr_root) {
- /*
- * The target CPU must never do the prepare work, except
- * on early boot when the boot CPU is the target. Otherwise
- * it may spuriously activate the old top level group inside
- * the new one (nevertheless whether old top level group is
- * active or not) and/or release an uninitialized childmask.
- */
- WARN_ON_ONCE(cpu == raw_smp_processor_id());
- /*
- * The (likely) current CPU is expected to be online in the hierarchy,
- * otherwise the old root may not be active as expected.
- */
- WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available);
- ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true);
+ if (old_root && old_root != hier->root) {
+ guard(migrate)();
+
+ if (cpumask_test_cpu(smp_processor_id(), hier->cpumask)) {
+ /*
+ * If the target belong to the same hierarchy, the old root is expected
+ * to be active. Link and propagate to the new root.
+ */
+ ret = tmigr_connect_old_root(hier, cpu, old_root, true);
+ } else {
+ int target = cpumask_first_and(hier->cpumask, tmigr_available_cpumask);
+
+ if (target < nr_cpu_ids) {
+ /*
+ * If the target doesn't belong to the same hierarchy as the current
+ * CPU, activate from a relevant one to make sure the old root is
+ * active.
+ */
+ ret = work_on_cpu(target, connect_old_root_work, old_root);
+ } else {
+ /*
+ * No other available CPUs in the remote hierarchy. Link the
+ * old root remotely but don't propagate activation since the
+ * old root is not expected to be active.
+ */
+ ret = tmigr_connect_old_root(hier, cpu, old_root, false);
+ }
+ }
}
+ if (ret >= 0)
+ cpumask_set_cpu(cpu, hier->cpumask);
+
return ret;
}
@@ -1974,7 +2107,7 @@ static int tmigr_cpu_prepare(unsigned int cpu)
static int __init tmigr_init(void)
{
- unsigned int cpulvl, nodelvl, cpus_per_node, i;
+ unsigned int cpulvl, nodelvl, cpus_per_node;
unsigned int nnodes = num_possible_nodes();
unsigned int ncpus = num_possible_cpus();
int ret = -ENOMEM;
@@ -2021,14 +2154,6 @@ static int __init tmigr_init(void)
*/
tmigr_crossnode_level = cpulvl;
- tmigr_level_list = kzalloc_objs(struct list_head,
- tmigr_hierarchy_levels);
- if (!tmigr_level_list)
- goto err;
-
- for (i = 0; i < tmigr_hierarchy_levels; i++)
- INIT_LIST_HEAD(&tmigr_level_list[i]);
-
pr_info("Timer migration: %d hierarchy levels; %d children per group;"
" %d crossnode level\n",
tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP,