diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-04-30 07:39:01 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-04-30 07:39:01 -0700 |
commit | 1f889ec62c3f0d8913f3c32f9aff2a1e15099346 (patch) | |
tree | 381a74ab73c865ba6d1c77df39898dd4f81cfeef /kernel/rcutree.c | |
parent | 6c24499f40d96bf07a85b709fb1bee5cea611a1d (diff) | |
parent | 8fcfae31719c0a6c03f2cf63f815b46d378d8be4 (diff) | |
download | lwn-1f889ec62c3f0d8913f3c32f9aff2a1e15099346.tar.gz lwn-1f889ec62c3f0d8913f3c32f9aff2a1e15099346.zip |
Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RCU updates from Ingo Molnar:
"The main changes in this cycle are mostly related to preparatory work
for the full-dynticks work:
- Remove restrictions on no-CBs CPUs, make RCU_FAST_NO_HZ take
advantage of numbered callbacks, do callback accelerations based on
numbered callbacks. Posted to LKML at
https://lkml.org/lkml/2013/3/18/960
- RCU documentation updates. Posted to LKML at
https://lkml.org/lkml/2013/3/18/570
- Miscellaneous fixes. Posted to LKML at
https://lkml.org/lkml/2013/3/18/594"
* 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (24 commits)
rcu: Make rcu_accelerate_cbs() note need for future grace periods
rcu: Abstract rcu_start_future_gp() from rcu_nocb_wait_gp()
rcu: Rename n_nocb_gp_requests to need_future_gp
rcu: Push lock release to rcu_start_gp()'s callers
rcu: Repurpose no-CBs event tracing to future-GP events
rcu: Rearrange locking in rcu_start_gp()
rcu: Make RCU_FAST_NO_HZ take advantage of numbered callbacks
rcu: Accelerate RCU callbacks at grace-period end
rcu: Export RCU_FAST_NO_HZ parameters to sysfs
rcu: Distinguish "rcuo" kthreads by RCU flavor
rcu: Add event tracing for no-CBs CPUs' grace periods
rcu: Add event tracing for no-CBs CPUs' callback registration
rcu: Introduce proper blocking to no-CBs kthreads GP waits
rcu: Provide compile-time control for no-CBs CPUs
rcu: Tone down debugging during boot-up and shutdown.
rcu: Add softirq-stall indications to stall-warning messages
rcu: Documentation update
rcu: Make bugginess of code sample more evident
rcu: Fix hlist_bl_set_first_rcu() annotation
rcu: Delete unused rcu_node "wakemask" field
...
Diffstat (limited to 'kernel/rcutree.c')
-rw-r--r-- | kernel/rcutree.c | 260 |
1 file changed, 197 insertions, 63 deletions
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 2f8530bc4b17..d8534308fd05 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -64,7 +64,7 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; -#define RCU_STATE_INITIALIZER(sname, cr) { \ +#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \ .level = { &sname##_state.node[0] }, \ .call = cr, \ .fqs_state = RCU_GP_IDLE, \ @@ -76,13 +76,14 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ .name = #sname, \ + .abbr = sabbr, \ } struct rcu_state rcu_sched_state = - RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); + RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); -struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); static struct rcu_state *rcu_state; @@ -223,6 +224,8 @@ static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); +static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp); static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(int cpu); @@ -310,6 +313,8 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) if (rcu_gp_in_progress(rsp)) return 0; /* No, a grace period is already in progress. */ + if (rcu_nocb_needs_gp(rsp)) + return 1; /* Yes, a no-CBs CPU needs one. */ if (!rdp->nxttail[RCU_NEXT_TAIL]) return 0; /* No, this is a no-CBs (or offline) CPU. 
*/ if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) @@ -1035,10 +1040,11 @@ static void init_callback_list(struct rcu_data *rdp) { int i; + if (init_nocb_callback_list(rdp)) + return; rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; - init_nocb_callback_list(rdp); } /* @@ -1071,6 +1077,120 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp, } /* + * Trace-event helper function for rcu_start_future_gp() and + * rcu_nocb_wait_gp(). + */ +static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long c, char *s) +{ + trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, + rnp->completed, c, rnp->level, + rnp->grplo, rnp->grphi, s); +} + +/* + * Start some future grace period, as needed to handle newly arrived + * callbacks. The required future grace periods are recorded in each + * rcu_node structure's ->need_future_gp field. + * + * The caller must hold the specified rcu_node structure's ->lock. + */ +static unsigned long __maybe_unused +rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) +{ + unsigned long c; + int i; + struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + + /* + * Pick up grace-period number for new callbacks. If this + * grace period is already marked as needed, return to the caller. + */ + c = rcu_cbs_completed(rdp->rsp, rnp); + trace_rcu_future_gp(rnp, rdp, c, "Startleaf"); + if (rnp->need_future_gp[c & 0x1]) { + trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf"); + return c; + } + + /* + * If either this rcu_node structure or the root rcu_node structure + * believe that a grace period is in progress, then we must wait + * for the one following, which is in "c". Because our request + * will be noticed at the end of the current grace period, we don't + * need to explicitly start one. 
+ */ + if (rnp->gpnum != rnp->completed || + ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { + rnp->need_future_gp[c & 0x1]++; + trace_rcu_future_gp(rnp, rdp, c, "Startedleaf"); + return c; + } + + /* + * There might be no grace period in progress. If we don't already + * hold it, acquire the root rcu_node structure's lock in order to + * start one (if needed). + */ + if (rnp != rnp_root) + raw_spin_lock(&rnp_root->lock); + + /* + * Get a new grace-period number. If there really is no grace + * period in progress, it will be smaller than the one we obtained + * earlier. Adjust callbacks as needed. Note that even no-CBs + * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed. + */ + c = rcu_cbs_completed(rdp->rsp, rnp_root); + for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) + if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) + rdp->nxtcompleted[i] = c; + + /* + * If the needed for the required grace period is already + * recorded, trace and leave. + */ + if (rnp_root->need_future_gp[c & 0x1]) { + trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot"); + goto unlock_out; + } + + /* Record the need for the future grace period. */ + rnp_root->need_future_gp[c & 0x1]++; + + /* If a grace period is not already in progress, start one. */ + if (rnp_root->gpnum != rnp_root->completed) { + trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot"); + } else { + trace_rcu_future_gp(rnp, rdp, c, "Startedroot"); + rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); + } +unlock_out: + if (rnp != rnp_root) + raw_spin_unlock(&rnp_root->lock); + return c; +} + +/* + * Clean up any old requests for the just-ended grace period. Also return + * whether any additional grace periods have been requested. Also invoke + * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads + * waiting for this grace period to complete. 
+ */ +static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +{ + int c = rnp->completed; + int needmore; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + + rcu_nocb_gp_cleanup(rsp, rnp); + rnp->need_future_gp[c & 0x1] = 0; + needmore = rnp->need_future_gp[(c + 1) & 0x1]; + trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup"); + return needmore; +} + +/* * If there is room, assign a ->completed number to any callbacks on * this CPU that have not already been assigned. Also accelerate any * callbacks that were previously assigned a ->completed number that has @@ -1129,6 +1249,8 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; rdp->nxtcompleted[i] = c; } + /* Record any needed additional grace periods. */ + rcu_start_future_gp(rnp, rdp); /* Trace depending on how much we were able to accelerate. */ if (!*rdp->nxttail[RCU_WAIT_TAIL]) @@ -1308,9 +1430,9 @@ static int rcu_gp_init(struct rcu_state *rsp) rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; - rnp->gpnum = rsp->gpnum; + ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; WARN_ON_ONCE(rnp->completed != rsp->completed); - rnp->completed = rsp->completed; + ACCESS_ONCE(rnp->completed) = rsp->completed; if (rnp == rdp->mynode) rcu_start_gp_per_cpu(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); @@ -1319,7 +1441,8 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); #ifdef CONFIG_PROVE_RCU_DELAY - if ((prandom_u32() % (rcu_num_nodes * 8)) == 0) + if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 && + system_state == SYSTEM_RUNNING) schedule_timeout_uninterruptible(2); #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ cond_resched(); @@ -1361,6 +1484,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) static void rcu_gp_cleanup(struct rcu_state *rsp) { unsigned long gp_duration; + int nocb = 0; struct rcu_data 
*rdp; struct rcu_node *rnp = rcu_get_root(rsp); @@ -1390,17 +1514,23 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); - rnp->completed = rsp->gpnum; + ACCESS_ONCE(rnp->completed) = rsp->gpnum; + rdp = this_cpu_ptr(rsp->rda); + if (rnp == rdp->mynode) + __rcu_process_gp_end(rsp, rnp, rdp); + nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); cond_resched(); } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); + rcu_nocb_gp_set(rnp, nocb); rsp->completed = rsp->gpnum; /* Declare grace period done. */ trace_rcu_grace_period(rsp->name, rsp->completed, "end"); rsp->fqs_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); + rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ if (cpu_needs_another_gp(rsp, rdp)) rsp->gp_flags = 1; raw_spin_unlock_irq(&rnp->lock); @@ -1476,57 +1606,62 @@ static int __noreturn rcu_gp_kthread(void *arg) /* * Start a new RCU grace period if warranted, re-initializing the hierarchy * in preparation for detecting the next grace period. The caller must hold - * the root node's ->lock, which is released before return. Hard irqs must - * be disabled. + * the root node's ->lock and hard irqs must be disabled. * * Note that it is legal for a dying CPU (which is marked as offline) to * invoke this function. This can happen when the dying CPU reports its * quiescent state. 
*/ static void -rcu_start_gp(struct rcu_state *rsp, unsigned long flags) - __releases(rcu_get_root(rsp)->lock) +rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) { - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - struct rcu_node *rnp = rcu_get_root(rsp); - - if (!rsp->gp_kthread || - !cpu_needs_another_gp(rsp, rdp)) { + if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { /* * Either we have not yet spawned the grace-period * task, this CPU does not need another grace period, * or a grace period is already in progress. * Either way, don't start a new grace period. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - - /* - * Because there is no grace period in progress right now, - * any callbacks we have up to this point will be satisfied - * by the next grace period. So this is a good place to - * assign a grace period number to recently posted callbacks. - */ - rcu_accelerate_cbs(rsp, rnp, rdp); - rsp->gp_flags = RCU_GP_FLAG_INIT; - raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ - - /* Ensure that CPU is aware of completion of last grace period. */ - rcu_process_gp_end(rsp, rdp); - local_irq_restore(flags); /* Wake up rcu_gp_kthread() to start the grace period. */ wake_up(&rsp->gp_wq); } /* + * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's + * callbacks. Note that rcu_start_gp_advanced() cannot do this because it + * is invoked indirectly from rcu_advance_cbs(), which would result in + * endless recursion -- or would do so if it wasn't for the self-deadlock + * that is encountered beforehand. + */ +static void +rcu_start_gp(struct rcu_state *rsp) +{ + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + struct rcu_node *rnp = rcu_get_root(rsp); + + /* + * If there is no grace period in progress right now, any + * callbacks we have up to this point will be satisfied by the + * next grace period. 
Also, advancing the callbacks reduces the + * probability of false positives from cpu_needs_another_gp() + * resulting in pointless grace periods. So, advance callbacks + * then start the grace period! + */ + rcu_advance_cbs(rsp, rnp, rdp); + rcu_start_gp_advanced(rsp, rnp, rdp); +} + +/* * Report a full set of quiescent states to the specified rcu_state * data structure. This involves cleaning up after the prior grace * period and letting rcu_start_gp() start up the next grace period - * if one is needed. Note that the caller must hold rnp->lock, as - * required by rcu_start_gp(), which will release it. + * if one is needed. Note that the caller must hold rnp->lock, which + * is released before return. */ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) @@ -2124,7 +2259,8 @@ __rcu_process_callbacks(struct rcu_state *rsp) local_irq_save(flags); if (cpu_needs_another_gp(rsp, rdp)) { raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ - rcu_start_gp(rsp, flags); /* releases above lock */ + rcu_start_gp(rsp); + raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); } else { local_irq_restore(flags); } @@ -2169,7 +2305,8 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) static void invoke_rcu_core(void) { - raise_softirq(RCU_SOFTIRQ); + if (cpu_online(smp_processor_id())) + raise_softirq(RCU_SOFTIRQ); } /* @@ -2204,11 +2341,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, /* Start a new grace period if one not already started. */ if (!rcu_gp_in_progress(rsp)) { - unsigned long nestflag; struct rcu_node *rnp_root = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp_root->lock, nestflag); - rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ + raw_spin_lock(&rnp_root->lock); + rcu_start_gp(rsp); + raw_spin_unlock(&rnp_root->lock); } else { /* Give the grace period a kick. 
*/ rdp->blimit = LONG_MAX; @@ -2628,19 +2765,27 @@ static int rcu_pending(int cpu) } /* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. + * Return true if the specified CPU has any callback. If all_lazy is + * non-NULL, store an indication of whether all callbacks are lazy. + * (If there are no callbacks, all of them are deemed to be lazy.) */ -static int rcu_cpu_has_callbacks(int cpu) +static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) { + bool al = true; + bool hc = false; + struct rcu_data *rdp; struct rcu_state *rsp; - /* RCU callbacks either ready or pending? */ - for_each_rcu_flavor(rsp) - if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) - return 1; - return 0; + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (rdp->qlen != rdp->qlen_lazy) + al = false; + if (rdp->nxtlist) + hc = true; + } + if (all_lazy) + *all_lazy = al; + return hc; } /* @@ -2859,7 +3004,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); - rcu_prepare_for_idle_init(cpu); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* Add CPU to rcu_node bitmasks. 
*/ @@ -2909,7 +3053,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; struct rcu_state *rsp; - int ret = NOTIFY_OK; trace_rcu_utilization("Start CPU hotplug"); switch (action) { @@ -2923,21 +3066,12 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, rcu_boost_kthread_setaffinity(rnp, -1); break; case CPU_DOWN_PREPARE: - if (nocb_cpu_expendable(cpu)) - rcu_boost_kthread_setaffinity(rnp, cpu); - else - ret = NOTIFY_BAD; + rcu_boost_kthread_setaffinity(rnp, cpu); break; case CPU_DYING: case CPU_DYING_FROZEN: - /* - * The whole machine is "stopped" except this CPU, so we can - * touch any data without introducing corruption. We send the - * dying CPU's callbacks to an arbitrarily chosen online CPU. - */ for_each_rcu_flavor(rsp) rcu_cleanup_dying_cpu(rsp); - rcu_cleanup_after_idle(cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: @@ -2950,7 +3084,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, break; } trace_rcu_utilization("End CPU hotplug"); - return ret; + return NOTIFY_OK; } /* @@ -3085,6 +3219,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, } rnp->level = i; INIT_LIST_HEAD(&rnp->blkd_tasks); + rcu_init_one_nocb(rnp); } } @@ -3170,8 +3305,7 @@ void __init rcu_init(void) rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); - rcu_init_nocb(); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); /* * We don't need protection against CPU-hotplug here because |