summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul E. McKenney <paulmck@linux.vnet.ibm.com>2012-04-23 15:52:53 -0700
committerPaul E. McKenney <paulmck@linux.vnet.ibm.com>2012-07-02 12:33:20 -0700
commitf885b7f2b2de70be266d2cecc476f773a1e2ca5d (patch)
treec4f4d03ca1469f22701b848274034a74c5ae2b04
parentcba6d0d64ee53772b285d0c0c288deefbeaf7775 (diff)
downloadlwn-f885b7f2b2de70be266d2cecc476f773a1e2ca5d.tar.gz
lwn-f885b7f2b2de70be266d2cecc476f773a1e2ca5d.zip
rcu: Control RCU_FANOUT_LEAF from boot-time parameter
Although making RCU_FANOUT_LEAF a kernel configuration parameter rather than a fixed constant makes it easier for people to decrease cache-miss overhead for large systems, it is of little help for people who must run a single pre-built kernel binary. This commit therefore allows the value of RCU_FANOUT_LEAF to be increased (but not decreased!) via a boot-time parameter named rcutree.rcu_fanout_leaf. Reported-by: Mike Galbraith <efault@gmx.de> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
-rw-r--r--Documentation/kernel-parameters.txt5
-rw-r--r--kernel/rcutree.c97
-rw-r--r--kernel/rcutree.h23
-rw-r--r--kernel/rcutree_plugin.h2
-rw-r--r--kernel/rcutree_trace.c2
5 files changed, 104 insertions, 25 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index a92c5ebf373e..12783fa833c3 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2367,6 +2367,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Set maximum number of finished RCU callbacks to process
in one batch.
+ rcutree.fanout_leaf= [KNL,BOOT]
+ Increase the number of CPUs assigned to each
+ leaf rcu_node structure. Useful for very large
+ systems.
+
rcutree.qhimark= [KNL,BOOT]
Set threshold of queued
RCU callbacks over which batch limiting is disabled.
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 4b97bba7396e..a4c592b66e10 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -60,17 +60,10 @@
/* Data structures. */
-static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
+static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
#define RCU_STATE_INITIALIZER(structname) { \
.level = { &structname##_state.node[0] }, \
- .levelcnt = { \
- NUM_RCU_LVL_0, /* root of hierarchy. */ \
- NUM_RCU_LVL_1, \
- NUM_RCU_LVL_2, \
- NUM_RCU_LVL_3, \
- NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
- }, \
.fqs_state = RCU_GP_IDLE, \
.gpnum = -300, \
.completed = -300, \
@@ -91,6 +84,19 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
static struct rcu_state *rcu_state;
+/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
+static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
+module_param(rcu_fanout_leaf, int, 0);
+int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
+static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
+ NUM_RCU_LVL_0,
+ NUM_RCU_LVL_1,
+ NUM_RCU_LVL_2,
+ NUM_RCU_LVL_3,
+ NUM_RCU_LVL_4,
+};
+int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
+
/*
* The rcu_scheduler_active variable transitions from zero to one just
* before the first task is spawned. So when this variable is zero, RCU
@@ -2574,9 +2580,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
{
int i;
- for (i = NUM_RCU_LVLS - 1; i > 0; i--)
+ for (i = rcu_num_lvls - 1; i > 0; i--)
rsp->levelspread[i] = CONFIG_RCU_FANOUT;
- rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF;
+ rsp->levelspread[0] = rcu_fanout_leaf;
}
#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
static void __init rcu_init_levelspread(struct rcu_state *rsp)
@@ -2586,7 +2592,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
int i;
cprv = NR_CPUS;
- for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+ for (i = rcu_num_lvls - 1; i >= 0; i--) {
ccur = rsp->levelcnt[i];
rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
cprv = ccur;
@@ -2613,13 +2619,15 @@ static void __init rcu_init_one(struct rcu_state *rsp,
/* Initialize the level-tracking arrays. */
- for (i = 1; i < NUM_RCU_LVLS; i++)
+ for (i = 0; i < rcu_num_lvls; i++)
+ rsp->levelcnt[i] = num_rcu_lvl[i];
+ for (i = 1; i < rcu_num_lvls; i++)
rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
rcu_init_levelspread(rsp);
/* Initialize the elements themselves, starting from the leaves. */
- for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+ for (i = rcu_num_lvls - 1; i >= 0; i--) {
cpustride *= rsp->levelspread[i];
rnp = rsp->level[i];
for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
@@ -2649,7 +2657,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
}
rsp->rda = rda;
- rnp = rsp->level[NUM_RCU_LVLS - 1];
+ rnp = rsp->level[rcu_num_lvls - 1];
for_each_possible_cpu(i) {
while (i > rnp->grphi)
rnp++;
@@ -2658,11 +2666,72 @@ static void __init rcu_init_one(struct rcu_state *rsp,
}
}
+/*
+ * Compute the rcu_node tree geometry from kernel parameters. This cannot
+ * replace the definitions in rcutree.h because those are needed to size
+ * the ->node array in the rcu_state structure.
+ */
+static void __init rcu_init_geometry(void)
+{
+ int i;
+ int j;
+ int n = NR_CPUS;
+ int rcu_capacity[MAX_RCU_LVLS + 1];
+
+ /* If the compile-time values are accurate, just leave. */
+ if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF)
+ return;
+
+ /*
+ * Compute number of nodes that can be handled an rcu_node tree
+ * with the given number of levels. Setting rcu_capacity[0] makes
+ * some of the arithmetic easier.
+ */
+ rcu_capacity[0] = 1;
+ rcu_capacity[1] = rcu_fanout_leaf;
+ for (i = 2; i <= MAX_RCU_LVLS; i++)
+ rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;
+
+ /*
+ * The boot-time rcu_fanout_leaf parameter is only permitted
+ * to increase the leaf-level fanout, not decrease it. Of course,
+ * the leaf-level fanout cannot exceed the number of bits in
+ * the rcu_node masks. Finally, the tree must be able to accommodate
+ * the configured number of CPUs. Complain and fall back to the
+ * compile-time values if these limits are exceeded.
+ */
+ if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
+ rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
+ n > rcu_capacity[MAX_RCU_LVLS]) {
+ WARN_ON(1);
+ return;
+ }
+
+ /* Calculate the number of rcu_nodes at each level of the tree. */
+ for (i = 1; i <= MAX_RCU_LVLS; i++)
+ if (n <= rcu_capacity[i]) {
+ for (j = 0; j <= i; j++)
+ num_rcu_lvl[j] =
+ DIV_ROUND_UP(n, rcu_capacity[i - j]);
+ rcu_num_lvls = i;
+ for (j = i + 1; j <= MAX_RCU_LVLS; j++)
+ num_rcu_lvl[j] = 0;
+ break;
+ }
+
+ /* Calculate the total number of rcu_node structures. */
+ rcu_num_nodes = 0;
+ for (i = 0; i <= MAX_RCU_LVLS; i++)
+ rcu_num_nodes += num_rcu_lvl[i];
+ rcu_num_nodes -= n;
+}
+
void __init rcu_init(void)
{
int cpu;
rcu_bootup_announce();
+ rcu_init_geometry();
rcu_init_one(&rcu_sched_state, &rcu_sched_data);
rcu_init_one(&rcu_bh_state, &rcu_bh_data);
__rcu_init_preempt();
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 19b61ac1079f..780a0195d35a 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -42,28 +42,28 @@
#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
#if NR_CPUS <= RCU_FANOUT_1
-# define NUM_RCU_LVLS 1
+# define RCU_NUM_LVLS 1
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 (NR_CPUS)
# define NUM_RCU_LVL_2 0
# define NUM_RCU_LVL_3 0
# define NUM_RCU_LVL_4 0
#elif NR_CPUS <= RCU_FANOUT_2
-# define NUM_RCU_LVLS 2
+# define RCU_NUM_LVLS 2
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
# define NUM_RCU_LVL_2 (NR_CPUS)
# define NUM_RCU_LVL_3 0
# define NUM_RCU_LVL_4 0
#elif NR_CPUS <= RCU_FANOUT_3
-# define NUM_RCU_LVLS 3
+# define RCU_NUM_LVLS 3
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
# define NUM_RCU_LVL_3 (NR_CPUS)
# define NUM_RCU_LVL_4 0
#elif NR_CPUS <= RCU_FANOUT_4
-# define NUM_RCU_LVLS 4
+# define RCU_NUM_LVLS 4
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
@@ -76,6 +76,9 @@
#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
+extern int rcu_num_lvls;
+extern int rcu_num_nodes;
+
/*
* Dynticks per-CPU state.
*/
@@ -206,7 +209,7 @@ struct rcu_node {
*/
#define rcu_for_each_node_breadth_first(rsp, rnp) \
for ((rnp) = &(rsp)->node[0]; \
- (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
/*
* Do a breadth-first scan of the non-leaf rcu_node structures for the
@@ -215,7 +218,7 @@ struct rcu_node {
*/
#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
for ((rnp) = &(rsp)->node[0]; \
- (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
+ (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
/*
* Scan the leaves of the rcu_node hierarchy for the specified rcu_state
@@ -224,8 +227,8 @@ struct rcu_node {
* It is still a leaf node, even if it is also the root node.
*/
#define rcu_for_each_leaf_node(rsp, rnp) \
- for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
- (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+ for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
+ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
/* Index values for nxttail array in struct rcu_data. */
#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
@@ -357,9 +360,9 @@ do { \
*/
struct rcu_state {
struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
- struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
+ struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
- u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
+ u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
/* The following fields are guarded by the root rcu_node's lock. */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 3e4899459f3d..fb92b6dd9980 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -70,6 +70,8 @@ static void __init rcu_bootup_announce_oddness(void)
#if NUM_RCU_LVL_4 != 0
printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
#endif
+ if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
+ printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
}
#ifdef CONFIG_TREE_PREEMPT_RCU
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d4bc16ddd1d4..a3556a2d842c 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -278,7 +278,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
rsp->n_force_qs, rsp->n_force_qs_ngp,
rsp->n_force_qs - rsp->n_force_qs_ngp,
rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen);
- for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
+ for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
if (rnp->level != level) {
seq_puts(m, "\n");
level = rnp->level;