From 832aa35a65bac800a1adbf2eab0b42427032cab8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Aug 2018 13:37:47 -0700 Subject: doc: Set down forward-progress requirements This commit adds a section to the requirements documentation setting down requirements for grace-period and callback-invocation forward progress. Signed-off-by: Paul E. McKenney --- .../RCU/Design/Requirements/Requirements.html | 110 ++++++++++++++++++++- 1 file changed, 108 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 43c4e2f05f40..7efc1c1da7af 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1381,6 +1381,7 @@ Classes of quality-of-implementation requirements are as follows:
  1. Specialization
  2. Performance and Scalability +
  3. Forward Progress
  4. Composability
  5. Corner Cases
@@ -1822,6 +1823,106 @@ so it is too early to tell whether they will stand the test of time. RCU thus provides a range of tools to allow updaters to strike the required tradeoff between latency, flexibility and CPU overhead. +

Forward Progress

+ +

+In theory, delaying grace-period completion and callback invocation +is harmless. +In practice, not only are memory sizes finite but also callbacks sometimes +do wakeups, and sufficiently deferred wakeups can be difficult +to distinguish from system hangs. +Therefore, RCU must provide a number of mechanisms to promote forward +progress. + +

+These mechanisms are not foolproof, nor can they be. +For one simple example, an infinite loop in an RCU read-side critical +section must by definition prevent later grace periods from ever completing. +For a more involved example, consider a 64-CPU system built with +CONFIG_RCU_NOCB_CPU=y and booted with rcu_nocbs=1-63, +where CPUs 1 through 63 spin in tight loops that invoke +call_rcu(). +Even if these tight loops also contain calls to cond_resched() +(thus allowing grace periods to complete), CPU 0 simply will +not be able to invoke callbacks as fast as the other 63 CPUs can +register them, at least not until the system runs out of memory. +In both of these examples, the Spiderman principle applies: With great +power comes great responsibility. +However, short of this level of abuse, RCU is required to +ensure timely completion of grace periods and timely invocation of +callbacks. + +

+RCU takes the following steps to encourage timely completion of +grace periods: + +

    +
  1. If a grace period fails to complete within 100 milliseconds, + RCU causes future invocations of cond_resched() on + the holdout CPUs to provide an RCU quiescent state. + RCU also causes those CPUs' need_resched() invocations + to return true, but only after the corresponding CPU's + next scheduling-clock interrupt. +
  2. CPUs mentioned in the nohz_full kernel boot parameter + can run indefinitely in the kernel without scheduling-clock + interrupts, which defeats the above need_resched() + stratagem. + RCU will therefore invoke resched_cpu() on any + nohz_full CPUs still holding out after + 109 milliseconds. +
  3. In kernels built with CONFIG_RCU_BOOST=y, if a given + task that has been preempted within an RCU read-side critical + section is holding out for more than 500 milliseconds, + RCU will resort to priority boosting. +
  4. If a CPU is still holding out 10 seconds into the grace + period, RCU will invoke resched_cpu() on it regardless + of its nohz_full state. +
+ +

+The above values are defaults for systems running with HZ=1000. +They will vary as the value of HZ varies, and can also be +changed using the relevant Kconfig options and kernel boot parameters. +RCU currently does not do much sanity checking of these +parameters, so please use caution when changing them. +Note that these forward-progress measures are provided only for RCU, +not for +SRCU or +Tasks RCU. + +

+RCU takes the following steps in call_rcu() to encourage timely +invocation of callbacks when any given non-rcu_nocbs CPU has +10,000 callbacks, or has 10,000 more callbacks than it had the last time +encouragement was provided: + +

    +
  1. Starts a grace period, if one is not already in progress. +
  2. Forces immediate checking for quiescent states, rather than + waiting for three milliseconds to have elapsed since the + beginning of the grace period. +
  3. Immediately tags the CPU's callbacks with their grace period + completion numbers, rather than waiting for the RCU_SOFTIRQ + handler to get around to it. +
  4. Lifts callback-execution batch limits, which speeds up callback + invocation at the expense of degrading realtime response. +
+ +

+Again, these are default values when running at HZ=1000, +and can be overridden. +Again, these forward-progress measures are provided only for RCU, +not for +SRCU or +Tasks RCU. +Even for RCU, callback-invocation forward progress for rcu_nocbs +CPUs is much less well-developed, in part because workloads benefiting +from rcu_nocbs CPUs tend to invoke call_rcu() +relatively infrequently. +If workloads emerge that need both rcu_nocbs CPUs and high +call_rcu() invocation rates, then additional forward-progress +work will be required. +

Composability

@@ -2272,7 +2373,7 @@ that meets this requirement. Furthermore, NMI handlers can be interrupted by what appear to RCU to be normal interrupts. One way that this can happen is for code that directly invokes -rcu_irq_enter() and rcu_irq_exit() to be called +rcu_irq_enter() and rcu_irq_exit() to be called from an NMI handler. This astonishing fact of life prompted the current code structure, which has rcu_irq_enter() invoking rcu_nmi_enter() @@ -2294,7 +2395,7 @@ via del_timer_sync() or similar.

Unfortunately, there is no way to cancel an RCU callback; once you invoke call_rcu(), the callback function is -going to eventually be invoked, unless the system goes down first. +eventually going to be invoked, unless the system goes down first. Because it is normally considered socially irresponsible to crash the system in response to a module unload request, we need some other way to deal with in-flight RCU callbacks. @@ -3233,6 +3334,11 @@ For example, RCU callback overhead might be charged back to the originating call_rcu() instance, though probably not in production kernels. +

+Additional work may be required to provide reasonable forward-progress +guarantees under heavy load for grace periods and for callback +invocation. +

Summary

-- cgit v1.2.3 From 2d0350a8f0e6eb5494141c61c5c749b5155df33d Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 21 Sep 2018 18:31:53 -0400 Subject: doc: Clarify RCU data-structure comment about rcu_tree fanout RCU Data-Structures document describes a trick to test RCU with small number of CPUs but with a taller tree. It wasn't immediately clear how the document arrived at 16 CPUs which also requires setting the FANOUT_LEAF to 2 instead of the default of 16. This commit therefore provides the needed clarification. Signed-off-by: Joel Fernandes (Google) Cc: Signed-off-by: Paul E. McKenney --- Documentation/RCU/Design/Data-Structures/Data-Structures.html | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html index 1d2051c0c3fc..476b1ac38e4c 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -127,9 +127,11 @@ CPUs, RCU would configure the rcu_node tree as follows:

RCU currently permits up to a four-level tree, which on a 64-bit system accommodates up to 4,194,304 CPUs, though only a mere 524,288 CPUs for 32-bit systems. -On the other hand, you can set CONFIG_RCU_FANOUT to be -as small as 2 if you wish, which would permit only 16 CPUs, which -is useful for testing. +On the other hand, you can set both CONFIG_RCU_FANOUT and +CONFIG_RCU_FANOUT_LEAF to be as small as 2, which would result +in a 16-CPU test using a 4-level tree. +This can be useful for testing large-system capabilities on small test +machines.

This multi-level combining tree allows us to get most of the performance and scalability -- cgit v1.2.3 From dd944caa8173a19c702076471aae17a2d793ebeb Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 22 Sep 2018 19:41:27 -0400 Subject: doc: Remove rcu_preempt_state reference in stallwarn Consolidation of RCU-bh, RCU-preempt, and RCU-sched into one RCU flavor to rule them all resulted in the removal of rcu_preempt_state. However, stallwarn.txt still mentions rcu_preempt_state. This commit therefore Updates stallwarn documentation accordingly. Signed-off-by: Joel Fernandes (Google) Cc: Signed-off-by: Paul E. McKenney --- Documentation/RCU/stallwarn.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 491043fd976f..b01bcafc64aa 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -176,9 +176,8 @@ causing stalls, and that the stall was affecting RCU-sched. This message will normally be followed by stack dumps for each CPU. Please note that PREEMPT_RCU builds can be stalled by tasks as well as by CPUs, and that the tasks will be indicated by PID, for example, "P3421". It is even -possible for a rcu_preempt_state stall to be caused by both CPUs -and- -tasks, in which case the offending CPUs and tasks will all be called -out in the list. +possible for an rcu_state stall to be caused by both CPUs -and- tasks, +in which case the offending CPUs and tasks will all be called out in the list. CPU 2's "(3 GPs behind)" indicates that this CPU has not interacted with the RCU core for the past three grace periods. 
In contrast, CPU 16's "(0 -- cgit v1.2.3 From 5cc379a42acd7104747077db7aaf4b01115ee484 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 25 Sep 2018 11:25:57 -0700 Subject: doc: Update information about resched_cpu Since commit fced9c8cfe6b ("rcu: Avoid resched_cpu() when rescheduling the current CPU"), resched_cpu is not directly called from sync_sched_exp_handler. Update the documentation about the same. Signed-off-by: Joel Fernandes (Google) Cc: Signed-off-by: Paul E. McKenney --- .../RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html index e62c7c34a369..8e4f873b979f 100644 --- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html +++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html @@ -160,9 +160,9 @@ was in flight. If the CPU is idle, then sync_sched_exp_handler() reports the quiescent state. -

-Otherwise, the handler invokes resched_cpu(), which forces -a future context switch. +

Otherwise, the handler forces a future context switch by setting the +NEED_RESCHED flag of the current task's thread flag and the CPU preempt +counter. At the time of the context switch, the CPU reports the quiescent state. Should the CPU go offline first, it will report the quiescent state at that time. -- cgit v1.2.3 From c9b6f899e120c83ef144b3d4a8365413ef49cce4 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 3 Oct 2018 17:37:25 -0700 Subject: doc: Remove rcu_dynticks from Data-Structures rcu_dynticks was folded into rcu_data structure. Update the data structures RCU document accordingly. Signed-off-by: Joel Fernandes (Google) Cc: Signed-off-by: Paul E. McKenney --- .../Data-Structures/BigTreeClassicRCUBHdyntick.svg | 695 --------------------- .../Design/Data-Structures/Data-Structures.html | 90 +-- 2 files changed, 25 insertions(+), 760 deletions(-) delete mode 100644 Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg (limited to 'Documentation') diff --git a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg deleted file mode 100644 index 21ba7823479d..000000000000 --- a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg +++ /dev/null @@ -1,695 +0,0 @@ - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rcu_bh - - struct - - rcu_node - - struct - - rcu_node - - rcu_node - - struct - - struct - - rcu_data - - struct - - rcu_data - - struct - - rcu_data - - struct - - rcu_data - - struct rcu_state - - struct - - rcu_dynticks - - struct - - rcu_dynticks - - struct - - rcu_dynticks - - struct - - rcu_dynticks - - rcu_sched - - - - - diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html 
b/Documentation/RCU/Design/Data-Structures/Data-Structures.html index 476b1ac38e4c..4eb603e3a005 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -23,8 +23,6 @@ to each other. The rcu_segcblist Structure

  • The rcu_data Structure -
  • - The rcu_dynticks Structure
  • The rcu_head Structure
  • @@ -174,16 +172,8 @@ said to be in dyntick-idle mode. RCU must handle dyntick-idle CPUs specially because RCU would otherwise wake up each CPU on every grace period, which would defeat the whole purpose of CONFIG_NO_HZ_IDLE. -RCU uses the rcu_dynticks structure to track -which CPUs are in dyntick idle mode, as shown below: - -

    BigTreeClassicRCUBHdyntick.svg - -

    However, if a CPU is in dyntick-idle mode, it is in that mode -for all flavors of RCU. -Therefore, a single rcu_dynticks structure is allocated per -CPU, and all of a given CPU's rcu_data structures share -that rcu_dynticks, as shown in the figure. +RCU uses the dynticks-related fields in the rcu_data structure +to track which CPUs are in dyntick-idle mode.

    Kernels built with CONFIG_PREEMPT_RCU support rcu_preempt in addition to rcu_sched and rcu_bh, as shown below: @@ -216,9 +206,6 @@ its own synchronization:

  • Each rcu_node structure has a spinlock.
  • The fields in rcu_data are private to the corresponding CPU, although a few can be read and written by other CPUs. -
  • Similarly, the fields in rcu_dynticks are private - to the corresponding CPU, although a few can be read by - other CPUs.

    It is important to note that different data structures can have @@ -274,11 +261,6 @@ follows: access to this information from the corresponding CPU. Finally, this structure records past dyntick-idle state for the corresponding CPU and also tracks statistics. -

  • rcu_dynticks: - This per-CPU structure tracks the current dyntick-idle - state for the corresponding CPU. - Unlike the other three structures, the rcu_dynticks - structure is not replicated per RCU flavor.
  • rcu_head: This structure represents RCU callbacks, and is the only structure allocated and managed by RCU users. @@ -289,8 +271,8 @@ follows:

    If all you wanted from this article was a general notion of how RCU's data structures are related, you are done. Otherwise, each of the following sections give more details on -the rcu_state, rcu_node, rcu_data, -and rcu_dynticks data structures. +the rcu_state, rcu_node and rcu_data data +structures.

    The rcu_state Structure

    @@ -1017,30 +999,19 @@ as follows:
       1   int cpu;
    -  2   struct rcu_state *rsp;
    -  3   struct rcu_node *mynode;
    -  4   struct rcu_dynticks *dynticks;
    -  5   unsigned long grpmask;
    -  6   bool beenonline;
    +  2   struct rcu_node *mynode;
    +  3   unsigned long grpmask;
    +  4   bool beenonline;
     

    The ->cpu field contains the number of the -corresponding CPU, the ->rsp pointer references -the corresponding rcu_state structure (and is most frequently -used to locate the name of the corresponding flavor of RCU for tracing), -and the ->mynode field references the corresponding -rcu_node structure. +corresponding CPU and the ->mynode field references the +corresponding rcu_node structure. The ->mynode is used to propagate quiescent states up the combining tree. -

    The ->dynticks pointer references the -rcu_dynticks structure corresponding to this -CPU. -Recall that a single per-CPU instance of the rcu_dynticks -structure is shared among all flavors of RCU. -These first four fields are constant and therefore require not -synchronization. +These two fields are constant and therefore do not require synchronization. -

    The ->grpmask field indicates the bit in +

    The ->grpmask field indicates the bit in the ->mynode->qsmask corresponding to this rcu_data structure, and is also used when propagating quiescent states. @@ -1181,26 +1152,22 @@ Finally, the ->dynticks_fqs field is used to count the number of times this CPU is determined to be in dyntick-idle state, and is used for tracing and debugging purposes. -

    -The rcu_dynticks Structure

    - -

    The rcu_dynticks maintains the per-CPU dyntick-idle state -for the corresponding CPU. -Unlike the other structures, rcu_dynticks is not -replicated over the different flavors of RCU. -The fields in this structure may be accessed only from the corresponding -CPU (and from tracing) unless otherwise stated. -Its fields are as follows: +

    +This portion of the rcu_data structure is declared as follows:

       1   long dynticks_nesting;
       2   long dynticks_nmi_nesting;
       3   atomic_t dynticks;
       4   bool rcu_need_heavy_qs;
    -  5   unsigned long rcu_qs_ctr;
    -  6   bool rcu_urgent_qs;
    +  5   bool rcu_urgent_qs;
     
    +

    These fields in the rcu_data structure maintain the per-CPU dyntick-idle +state for the corresponding CPU. +The fields may be accessed only from the corresponding CPU (and from tracing) +unless otherwise stated. +

    The ->dynticks_nesting field counts the nesting depth of process execution, so that in normal circumstances this counter has value zero or one. @@ -1242,19 +1209,12 @@ it is willing to call for heavy-weight dyntick-counter operations. This flag is checked by RCU's context-switch and cond_resched() code, which provide a momentary idle sojourn in response. -

    The ->rcu_qs_ctr field is used to record -quiescent states from cond_resched(). -Because cond_resched() can execute quite frequently, this -must be quite lightweight, as in a non-atomic increment of this -per-CPU field. -

    Finally, the ->rcu_urgent_qs field is used to record -the fact that the RCU core code would really like to see a quiescent -state from the corresponding CPU, with the various other fields indicating -just how badly RCU wants this quiescent state. -This flag is checked by RCU's context-switch and cond_resched() -code, which, if nothing else, non-atomically increment ->rcu_qs_ctr -in response. +the fact that the RCU core code would really like to see a quiescent state from +the corresponding CPU, with the various other fields indicating just how badly +RCU wants this quiescent state. +This flag is checked by RCU's context-switch path +(rcu_note_context_switch) and the cond_resched code. @@ -1431,7 +1391,7 @@ So each flavor of RCU is represented by an rcu_state structure, which contains a combining tree of rcu_node and rcu_data structures. Finally, in CONFIG_NO_HZ_IDLE kernels, each CPU's dyntick-idle -state is tracked by an rcu_dynticks structure. +state is tracked by dynticks-related fields in the rcu_data structure. If you made it this far, you are well prepared to read the code walkthroughs in the other articles in this series. -- cgit v1.2.3 From b54d9db26031d6dc96222164092eacbaa0329255 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 3 Oct 2018 17:40:28 -0700 Subject: doc: rcu: Update Data-Structures for RCU flavor consolidation This patch updates all Data-Structures document figures and text and removes some unwanted figures, to reflect the recent work Paul has been doing with consolidating all flavors of RCU. Signed-off-by: Joel Fernandes (Google) Cc: Signed-off-by: Paul E. 
McKenney --- .../Design/Data-Structures/BigTreeClassicRCUBH.svg | 499 ------------ .../Data-Structures/BigTreePreemptRCUBHdyntick.svg | 741 ------------------ .../BigTreePreemptRCUBHdyntickCB.svg | 834 ++++++++------------- .../Design/Data-Structures/Data-Structures.html | 49 +- .../RCU/Design/Data-Structures/blkd_task.svg | 676 ++++++----------- 5 files changed, 559 insertions(+), 2240 deletions(-) delete mode 100644 Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg delete mode 100644 Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg (limited to 'Documentation') diff --git a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg deleted file mode 100644 index 9bbb1944f962..000000000000 --- a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg +++ /dev/null @@ -1,499 +0,0 @@ - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rcu_bh - - struct - - rcu_node - - struct - - rcu_node - - rcu_node - - struct - - struct - - rcu_data - - struct - - rcu_data - - struct - - rcu_data - - struct - - rcu_data - - struct rcu_state - - rcu_sched - - - - - - - - - - - diff --git a/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg deleted file mode 100644 index 15adcac036c7..000000000000 --- a/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg +++ /dev/null @@ -1,741 +0,0 @@ - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rcu_bh - - struct - - rcu_node - - struct - - rcu_node - - rcu_node - - 
struct - - struct - - rcu_data - - struct - - rcu_data - - struct - - rcu_data - - struct - - rcu_data - - struct rcu_state - - struct - - rcu_dynticks - - struct - - rcu_dynticks - - struct - - rcu_dynticks - - struct - - rcu_dynticks - - rcu_preempt - - rcu_sched - - - - - diff --git a/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg index bbc3801470d0..3a1a4f85dc3a 100644 --- a/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg +++ b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg @@ -13,12 +13,12 @@ xmlns="http://www.w3.org/2000/svg" xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" - width="7.4in" - height="9.9in" - viewBox="-44 -44 8938 11938" + width="7.4000001in" + height="7.9000001in" + viewBox="-44 -44 8938 9526.283" id="svg2" version="1.1" - inkscape:version="0.48.4 r9939" + inkscape:version="0.92.2pre0 (973e216, 2017-07-25)" sodipodi:docname="BigTreePreemptRCUBHdyntickCB.svg"> @@ -37,15 +37,46 @@ + + + + + + + style="overflow:visible"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.00000003pt" + transform="matrix(-0.4,0,0,-0.4,-4,0)" + inkscape:connector-curvature="0" /> + style="fill:none;stroke-width:0.025in" + id="g4" + transform="translate(0,-2415.6743)"> - - - - - - - + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8" + id="polyline20" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> - + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8" + id="polyline24" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> - + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8" + id="polyline28" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> - + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8" + id="polyline32" + 
transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> - + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8" + id="polyline36" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> - + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8" + id="polyline40" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> + style="stroke:#00d1d1;stroke-width:29.99464035;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline46" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> + style="stroke:#00d1d1;stroke-width:29.99464035;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline50" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> + style="stroke:#00d1d1;stroke-width:29.99464035;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline54" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> + style="stroke:#00d1d1;stroke-width:29.99464035;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline58" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> + style="stroke:#00d1d1;stroke-width:29.99464035;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline62" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> - - - - - - + - + - + - + - + - + - + - + - + + style="stroke:#000000;stroke-width:30;stroke-linecap:butt;stroke-linejoin:miter;marker-end:url(#Arrow1Mend)" + id="polyline108" + transform="matrix(1,0,0,0.95854605,-604.69715,525.62477)" /> + style="stroke:#000000;stroke-width:30;stroke-linecap:butt;stroke-linejoin:miter;marker-end:url(#Arrow1Mend)" + id="polyline114" + transform="matrix(1,0,0,0.95854605,-604.69715,525.62477)" /> - - - - 
struct + id="text140" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">struct rcu_head + id="text142" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">rcu_head struct + id="text144" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">struct rcu_head + id="text146" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">rcu_head struct + id="text148" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">struct rcu_head + id="text150" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">rcu_head rcu_sched + id="text152" + style="font-style:normal;font-weight:normal;font-size:187.978302px;font-family:Helvetica;text-anchor:end;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">rcu_state - rcu_bh struct + id="text156" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">struct rcu_node + id="text158" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">rcu_node struct + 
id="text160" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">struct rcu_node + id="text162" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">rcu_node rcu_node + id="text164" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">rcu_node struct + id="text166" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">struct struct + id="text168" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">struct rcu_data + id="text170" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">rcu_data struct + id="text172" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">struct rcu_data + id="text174" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">rcu_data struct + id="text176" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">struct rcu_data + id="text178" + 
style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">rcu_data struct + id="text180" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">struct rcu_data + id="text182" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">rcu_data struct rcu_state + id="text184" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:start;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">struct rcu_state - struct - rcu_dynticks - struct - rcu_dynticks - struct - rcu_dynticks - struct - rcu_dynticks - rcu_preempt + style="stroke:#00d1d1;stroke-width:29.99464035;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline204" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> + diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html index 4eb603e3a005..28b241074c86 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -154,36 +154,9 @@ on that root rcu_node structure remains acceptably low. keeping lock contention under control at all tree levels regardless of the level of loading on the system. -

    The Linux kernel actually supports multiple flavors of RCU -running concurrently, so RCU builds separate data structures for each -flavor. -For example, for CONFIG_TREE_RCU=y kernels, RCU provides -rcu_sched and rcu_bh, as shown below: - -

    BigTreeClassicRCUBH.svg - -

    Energy efficiency is increasingly important, and for that -reason the Linux kernel provides CONFIG_NO_HZ_IDLE, which -turns off the scheduling-clock interrupts on idle CPUs, which in -turn allows those CPUs to attain deeper sleep states and to consume -less energy. -CPUs whose scheduling-clock interrupts have been turned off are -said to be in dyntick-idle mode. -RCU must handle dyntick-idle CPUs specially -because RCU would otherwise wake up each CPU on every grace period, -which would defeat the whole purpose of CONFIG_NO_HZ_IDLE. -RCU uses the dynticks related fields in the rcu_data structure -to track which CPUs are in dyntick idle mode. - -

    Kernels built with CONFIG_PREEMPT_RCU support -rcu_preempt in addition to rcu_sched and rcu_bh, as shown below: - -

    BigTreePreemptRCUBHdyntick.svg -

    RCU updaters wait for normal grace periods by registering RCU callbacks, either directly via call_rcu() and friends (namely call_rcu_bh() and call_rcu_sched()), -there being a separate interface per flavor of RCU) or indirectly via synchronize_rcu() and friends. RCU callbacks are represented by rcu_head structures, which are queued on rcu_data structures while they are @@ -278,7 +251,7 @@ structures. The rcu_state Structure

    The rcu_state structure is the base structure that -represents a flavor of RCU. +represents the state of RCU in the system. This structure forms the interconnection between the rcu_node and rcu_data structures, tracks grace periods, contains the lock used to @@ -373,7 +346,7 @@ sequence number. The bottom two bits are the state of the current grace period, which can be zero for not yet started or one for in progress. In other words, if the bottom two bits of ->gp_seq are -zero, the corresponding flavor of RCU is idle. +zero, then RCU is idle. Any other value in the bottom two bits indicates that something is broken. This field is protected by the root rcu_node structure's ->lock field. @@ -403,10 +376,10 @@ as follows: grace period in jiffies. It is protected by the root rcu_node's ->lock. -

    The ->name field points to the name of the RCU flavor -(for example, “rcu_sched”), and is constant. -The ->abbr field contains a one-character abbreviation, -for example, “s” for RCU-sched. +

    The ->name and ->abbr fields distinguish +between preemptible RCU (“rcu_preempt” and “p”) +and non-preemptible RCU (“rcu_sched” and “s”). +These fields are used for diagnostic and tracing purposes.

    The rcu_node Structure

    @@ -972,8 +945,7 @@ of rcu_barrier().

    The rcu_data Structure

    -

    The rcu_data maintains the per-CPU state for the -corresponding flavor of RCU. +

    The rcu_data maintains the per-CPU state for the RCU subsystem. The fields in this structure may be accessed only from the corresponding CPU (and from tracing) unless otherwise stated. This structure is the @@ -1030,7 +1002,6 @@ as follows: 3 bool cpu_no_qs; 4 bool core_needs_qs; 5 bool gpwrap; - 6 unsigned long rcu_qs_ctr_snap;

    The ->gp_seq and ->gp_seq_needed @@ -1076,10 +1047,6 @@ CPU has remained idle for so long that the gp_seq counter is in danger of overflow, which will cause the CPU to disregard the values of its counters on its next exit from idle. -Finally, the rcu_qs_ctr_snap field is used to detect -cases where a given operation has resulted in a quiescent state -for all flavors of RCU, for example, cond_resched() -when RCU has indicated a need for quiescent states.

    RCU Callback Handling
    @@ -1387,7 +1354,7 @@ the last part of the array, thus traversing only the leaf

    Summary

    -So each flavor of RCU is represented by an rcu_state structure, +So the state of RCU is represented by an rcu_state structure, which contains a combining tree of rcu_node and rcu_data structures. Finally, in CONFIG_NO_HZ_IDLE kernels, each CPU's dyntick-idle diff --git a/Documentation/RCU/Design/Data-Structures/blkd_task.svg b/Documentation/RCU/Design/Data-Structures/blkd_task.svg index 00e810bb8419..bed13e9ecab8 100644 --- a/Documentation/RCU/Design/Data-Structures/blkd_task.svg +++ b/Documentation/RCU/Design/Data-Structures/blkd_task.svg @@ -14,12 +14,12 @@ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" width="10.1in" - height="8.6in" - viewBox="-44 -44 12088 10288" + height="6.5999999in" + viewBox="-44 -44 12088 7895.4414" id="svg2" version="1.1" - inkscape:version="0.48.4 r9939" - sodipodi:docname="blkd_task.fig"> + inkscape:version="0.92.2pre0 (973e216, 2017-07-25)" + sodipodi:docname="blkd_task.svg"> @@ -37,15 +37,16 @@ + style="overflow:visible"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.00000003pt" + transform="matrix(-0.4,0,0,-0.4,-4,0)" + inkscape:connector-curvature="0" /> + inkscape:cx="456.40569" + inkscape:cy="348.88682" + inkscape:window-x="0" + inkscape:window-y="0" + inkscape:window-maximized="1" + inkscape:current-layer="g4" + showguides="false" /> + style="fill:none;stroke-width:0.025in" + id="g4" + transform="translate(0,-2393.6637)"> - - - - + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8" + id="polyline14" + transform="translate(23.757862,2185.7233)" /> - + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8" + id="polyline18" + transform="translate(23.757862,2185.7233)" /> - + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8" + id="polyline22" + transform="translate(23.757862,2185.7233)" /> - + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8" + id="polyline26" + 
transform="translate(23.757862,2185.7233)" /> + style="stroke:#00d1d1;stroke-width:30.00057793;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline32" + transform="translate(23.757862,2185.7233)" /> + style="stroke:#00d1d1;stroke-width:30.00057793;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline36" + transform="translate(23.757862,2185.7233)" /> + style="stroke:#00d1d1;stroke-width:30.00057793;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline40" + transform="translate(23.757862,2185.7233)" /> + style="stroke:#00d1d1;stroke-width:30.00057793;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline44" + transform="translate(23.757862,2185.7233)" /> + style="stroke:#00d1d1;stroke-width:30.00057793;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline48" + transform="translate(23.757862,2185.7233)" /> - - - - - - - - + points="7350,2850 7350,5100 5550,4350 5550,3450 " + style="fill:#ffbfbf;stroke:#000000;stroke-width:14;stroke-linecap:butt;stroke-linejoin:miter;stroke-dasharray:120, 120" + id="polygon106" + transform="translate(23.757862,2185.7233)" /> + style="stroke:#000000;stroke-width:30.00057793;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline108" + transform="translate(23.757862,2185.7233)" /> + style="stroke:#000000;stroke-width:30.00057793;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline114" + transform="translate(23.757862,2185.7233)" /> + 
style="stroke:#000000;stroke-width:30.00057793;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline120" + transform="translate(23.757862,2185.7233)" /> + style="stroke:#000000;stroke-width:30.00057793;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline126" + transform="translate(23.757862,2185.7233)" /> + style="stroke:#000000;stroke-width:30.00057793;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline130" + transform="translate(23.757862,2185.7233)" /> - rcu_bh struct + id="text136" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">struct rcu_node + id="text138" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">rcu_node struct + id="text140" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">struct rcu_node + id="text142" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">rcu_node struct + id="text144" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">struct rcu_data + id="text146" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">rcu_data struct + id="text148" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">struct rcu_data + id="text150" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">rcu_data struct + id="text152" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">struct rcu_data + 
id="text154" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">rcu_data struct + id="text156" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">struct rcu_data + id="text158" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">rcu_data struct rcu_state + id="text160" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:start;fill:#000000">struct rcu_state - struct - rcu_dynticks - struct - rcu_dynticks - struct - rcu_dynticks - struct - rcu_dynticks rcu_sched + id="text178" + style="font-style:normal;font-weight:normal;font-size:192px;font-family:Helvetica;text-anchor:end;fill:#000000">rcu_state T3 + id="text180" + style="font-style:normal;font-weight:normal;font-size:216px;font-family:Helvetica;text-anchor:middle;fill:#000000">T3 T2 + id="text182" + style="font-style:normal;font-weight:normal;font-size:216px;font-family:Helvetica;text-anchor:middle;fill:#000000">T2 T1 + id="text184" + style="font-style:normal;font-weight:normal;font-size:216px;font-family:Helvetica;text-anchor:middle;fill:#000000">T1 + style="stroke:#00d1d1;stroke-width:30.00057793;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline186" + transform="translate(23.757862,2185.7233)" /> rcu_node + id="text198" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">rcu_node struct + id="text200" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">struct blkd_tasks + id="text202" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:start;fill:#000000">blkd_tasks gp_tasks + id="text204" + 
style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:start;fill:#000000">gp_tasks exp_tasks + id="text206" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:start;fill:#000000">exp_tasks -- cgit v1.2.3 From 82eccec851478e55bfd398d7e9d03300026fc4a9 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 25 Sep 2018 11:26:00 -0700 Subject: doc: rcu: Better clarify the rcu_segcblist ->len field An important note under the rcu_segcblist description could use a more detailed description. Especially explanation of the scenario where the ->head field may be temporarily NULL making it not wise to rely on it to determine if callbacks are associated with the rcu_segcblist. Signed-off-by: Joel Fernandes (Google) Cc: Signed-off-by: Paul E. McKenney --- .../Design/Data-Structures/Data-Structures.html | 23 ++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'Documentation') diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html index 28b241074c86..3ed5f0182bc4 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -928,17 +928,24 @@ this rcu_segcblist structure, not the ->head pointer. The reason for this is that all the ready-to-invoke callbacks (that is, those in the RCU_DONE_TAIL segment) are extracted -all at once at callback-invocation time. +all at once at callback-invocation time (rcu_do_batch), due +to which ->head may be set to NULL if there are no not-done +callbacks remaining in the rcu_segcblist. If callback invocation must be postponed, for example, because a high-priority process just woke up on this CPU, then the remaining -callbacks are placed back on the RCU_DONE_TAIL segment. 
-Either way, the ->len and ->len_lazy counts -are adjusted after the corresponding callbacks have been invoked, and so -again it is the ->len count that accurately reflects whether -or not there are callbacks associated with this rcu_segcblist -structure. +callbacks are placed back on the RCU_DONE_TAIL segment and +->head once again points to the start of the segment. +In short, the head field can briefly be NULL even though the +CPU has callbacks present the entire time. +Therefore, it is not appropriate to test the ->head pointer +for NULL. + +

    In contrast, the ->len and ->len_lazy counts +are adjusted only after the corresponding callbacks have been invoked. +This means that the ->len count is zero only if +the rcu_segcblist structure really is devoid of callbacks. Of course, off-CPU sampling of the ->len count requires -the use of appropriate synchronization, for example, memory barriers. +careful use of appropriate synchronization, for example, memory barriers. This synchronization can be a bit subtle, particularly in the case of rcu_barrier(). -- cgit v1.2.3 From 70f0508caba2ccb564337e7a2ac4816b094abc00 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 25 Sep 2018 11:26:01 -0700 Subject: doc: rcu: Update description of gp_seq fields in rcu_data The rcu_state structure doesn't have a gp_seq_needed field. Update the description under rcu_data accordingly, to reflect this. Signed-off-by: Joel Fernandes (Google) Cc: Signed-off-by: Paul E. McKenney --- Documentation/RCU/Design/Data-Structures/Data-Structures.html | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html index 3ed5f0182bc4..18f179807563 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -1011,9 +1011,10 @@ as follows: 5 bool gpwrap; -

    The ->gp_seq and ->gp_seq_needed -fields are the counterparts of the fields of the same name -in the rcu_state and rcu_node structures. +

    The ->gp_seq field is the counterpart of the field of the same +name in the rcu_state and rcu_node structures. The +->gp_seq_needed field is the counterpart of the field of the same +name in the rcu_node structure. They may each lag up to one behind their rcu_node counterparts, but in CONFIG_NO_HZ_IDLE and CONFIG_NO_HZ_FULL kernels can lag -- cgit v1.2.3 From ed8f6fb247785d98ffe6babcf93b7bedd2c88fd8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Oct 2018 08:38:54 -0700 Subject: doc: Document rcutorture forward-progress test kernel parameters Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'Documentation') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 81d1d5a74728..3823679deea5 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3773,6 +3773,23 @@ Set wait time between force_quiescent_state bursts in seconds. + rcutorture.fwd_progress= [KNL] + Enable RCU grace-period forward-progress testing + for the types of RCU supporting this notion. + + rcutorture.fwd_progress_div= [KNL] + Specify the fraction of a CPU-stall-warning + period to do tight-loop forward-progress testing. + + rcutorture.fwd_progress_holdoff= [KNL] + Number of seconds to wait between successive + forward-progress tests. + + rcutorture.fwd_progress_need_resched= [KNL] + Enclose cond_resched() calls within checks for + need_resched() during tight-loop forward-progress + testing. + rcutorture.gp_cond= [KNL] Use conditional/asynchronous update-side primitives, if available. 
-- cgit v1.2.3 From 3398496483df3508764d24917deaa8ab5176969e Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 5 Oct 2018 16:18:10 -0700 Subject: doc: rcu: Update core and full API in whatisRCU RCU consolidation effort causes the update side of the RCU API to be consistent across all the 3 RCU flavors (normal, sched, bh). This commit therefore updates the full API in the whatisRCU document, thus encouraging people to use the consolidated RCU update API instead of the old RCU-bh and RCU-sched update APIs. Also rcu_dereference is documented to be the same for all 3 mechanisms (even before the consolidation), however its actually different - as using the right rcu_dereference primitive (such as rcu_dereference_bh for bh) is needed to make lock debugging work correctly. This update also corrects that. Also, add local_bh_disable() and local_bh_enable() as softirq protection primitives and correct a grammar error in a quiz answer. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- Documentation/RCU/whatisRCU.txt | 55 +++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 27 deletions(-) (limited to 'Documentation') diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 86d82f7f3500..7c33445fd0e5 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -322,28 +322,27 @@ to their callers and (2) call_rcu() callbacks may be invoked. Efficient implementations of the RCU infrastructure make heavy use of batching in order to amortize their overhead over many uses of the corresponding APIs. -There are no fewer than three RCU mechanisms in the Linux kernel; the -diagram above shows the first one, which is by far the most commonly used. 
-The rcu_dereference() and rcu_assign_pointer() primitives are used for -all three mechanisms, but different defer and protect primitives are -used as follows: +There are at least three flavors of RCU usage in the Linux kernel. The diagram +above shows the most common one. On the updater side, the rcu_assign_pointer(), +synchronize_rcu() and call_rcu() primitives used are the same for all three +flavors. However for protection (on the reader side), the primitives used vary +depending on the flavor: - Defer Protect +a. rcu_read_lock() / rcu_read_unlock() + rcu_dereference() -a. synchronize_rcu() rcu_read_lock() / rcu_read_unlock() - call_rcu() rcu_dereference() +b. rcu_read_lock_bh() / rcu_read_unlock_bh() + local_bh_disable() / local_bh_enable() + rcu_dereference_bh() -b. synchronize_rcu_bh() rcu_read_lock_bh() / rcu_read_unlock_bh() - call_rcu_bh() rcu_dereference_bh() +c. rcu_read_lock_sched() / rcu_read_unlock_sched() + preempt_disable() / preempt_enable() + local_irq_save() / local_irq_restore() + hardirq enter / hardirq exit + NMI enter / NMI exit + rcu_dereference_sched() -c. synchronize_sched() rcu_read_lock_sched() / rcu_read_unlock_sched() - call_rcu_sched() preempt_disable() / preempt_enable() - local_irq_save() / local_irq_restore() - hardirq enter / hardirq exit - NMI enter / NMI exit - rcu_dereference_sched() -These three mechanisms are used as follows: +These three flavors are used as follows: a. RCU applied to normal data structures. 
@@ -867,18 +866,20 @@ RCU: Critical sections Grace period Barrier bh: Critical sections Grace period Barrier - rcu_read_lock_bh call_rcu_bh rcu_barrier_bh - rcu_read_unlock_bh synchronize_rcu_bh - rcu_dereference_bh synchronize_rcu_bh_expedited + rcu_read_lock_bh call_rcu rcu_barrier + rcu_read_unlock_bh synchronize_rcu + [local_bh_disable] synchronize_rcu_expedited + [and friends] + rcu_dereference_bh rcu_dereference_bh_check rcu_dereference_bh_protected rcu_read_lock_bh_held sched: Critical sections Grace period Barrier - rcu_read_lock_sched synchronize_sched rcu_barrier_sched - rcu_read_unlock_sched call_rcu_sched - [preempt_disable] synchronize_sched_expedited + rcu_read_lock_sched call_rcu rcu_barrier + rcu_read_unlock_sched synchronize_rcu + [preempt_disable] synchronize_rcu_expedited [and friends] rcu_read_lock_sched_notrace rcu_read_unlock_sched_notrace @@ -890,8 +891,8 @@ sched: Critical sections Grace period Barrier SRCU: Critical sections Grace period Barrier - srcu_read_lock synchronize_srcu srcu_barrier - srcu_read_unlock call_srcu + srcu_read_lock call_srcu srcu_barrier + srcu_read_unlock synchronize_srcu srcu_dereference synchronize_srcu_expedited srcu_dereference_check srcu_read_lock_held @@ -1034,7 +1035,7 @@ Answer: Just as PREEMPT_RT permits preemption of spinlock spinlocks blocking while in RCU read-side critical sections. - Why the apparent inconsistency? Because it is it + Why the apparent inconsistency? Because it is possible to use priority boosting to keep the RCU grace periods short if need be (for example, if running short of memory). In contrast, if blocking waiting -- cgit v1.2.3 From 090c1685fd628a8c191d77b5267a7dc226246a5b Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 5 Oct 2018 16:18:11 -0700 Subject: doc: rcu: Add more rationale for using rcu_read_lock_sched in checklist This commit explains why rcu_read_lock_sched is better than using preempt_disable. 
Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- Documentation/RCU/checklist.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index 49747717d905..8860ab2a897a 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -63,7 +63,7 @@ over a rather long period of time, but improvements are always welcome! pointer must be covered by rcu_read_lock(), rcu_read_lock_bh(), rcu_read_lock_sched(), or by the appropriate update-side lock. Disabling of preemption can serve as rcu_read_lock_sched(), but - is less readable. + is less readable and prevents lockdep from detecting locking issues. Letting RCU-protected pointers "leak" out of an RCU read-side critical section is every bit as bad as letting them leak out -- cgit v1.2.3 From bc2072c9adebd6ed1a192ed55ae195d8fb415c8d Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 5 Oct 2018 16:18:12 -0700 Subject: doc: rcu: Remove obsolete suggestion from checklist call_rcu_bh is now implemented in terms of call_rcu, so the suggestion to use a different API for speed benefits is not accurate anymore. This commit updates the document accordingly. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- Documentation/RCU/checklist.txt | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index 8860ab2a897a..cc22ce49618d 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -285,11 +285,7 @@ over a rather long period of time, but improvements are always welcome! here is that superuser already has lots of ways to crash the machine. - d. Use call_rcu_bh() rather than call_rcu(), in order to take - advantage of call_rcu_bh()'s faster grace periods. (This - is only a partial solution, though.) 
- - e. Periodically invoke synchronize_rcu(), permitting a limited + d. Periodically invoke synchronize_rcu(), permitting a limited number of updates per grace period. The same cautions apply to call_rcu_bh(), call_rcu_sched(), -- cgit v1.2.3 From e060a03a1c9288f169297c7461ae1e4790b6c53a Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 5 Oct 2018 16:18:13 -0700 Subject: doc: rcu: Remove obsolete checklist item about synchronize_rcu usage Since the RCU mechanisms have been consolidated, the checklist item warning that synchronize_rcu() waits only for RCU readers is obsolete. This commit therefore removes this checklist item. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- Documentation/RCU/checklist.txt | 37 +++++++------------------------------ 1 file changed, 7 insertions(+), 30 deletions(-) (limited to 'Documentation') diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index cc22ce49618d..b90ad1b0665a 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -320,37 +320,14 @@ over a rather long period of time, but improvements are always welcome! will break Alpha, cause aggressive compilers to generate bad code, and confuse people trying to read your code. -11. Note that synchronize_rcu() -only- guarantees to wait until - all currently executing rcu_read_lock()-protected RCU read-side - critical sections complete. It does -not- necessarily guarantee - that all currently running interrupts, NMIs, preempt_disable() - code, or idle loops will complete. Therefore, if your - read-side critical sections are protected by something other - than rcu_read_lock(), do -not- use synchronize_rcu(). - - Similarly, disabling preemption is not an acceptable substitute - for rcu_read_lock(). Code that attempts to use preemption - disabling where it should be using rcu_read_lock() will break - in CONFIG_PREEMPT=y kernel builds. 
- - If you want to wait for interrupt handlers, NMI handlers, and - code under the influence of preempt_disable(), you instead - need to use synchronize_irq() or synchronize_sched(). - - This same limitation also applies to synchronize_rcu_bh() - and synchronize_srcu(), as well as to the asynchronous and - expedited forms of the three primitives, namely call_rcu(), - call_rcu_bh(), call_srcu(), synchronize_rcu_expedited(), - synchronize_rcu_bh_expedited(), and synchronize_srcu_expedited(). - -12. Any lock acquired by an RCU callback must be acquired elsewhere +11. Any lock acquired by an RCU callback must be acquired elsewhere with softirq disabled, e.g., via spin_lock_irqsave(), spin_lock_bh(), etc. Failing to disable irq on a given acquisition of that lock will result in deadlock as soon as the RCU softirq handler happens to run your RCU callback while interrupting that acquisition's critical section. -13. RCU callbacks can be and are executed in parallel. In many cases, +12. RCU callbacks can be and are executed in parallel. In many cases, the callback code simply wrappers around kfree(), so that this is not an issue (or, more accurately, to the extent that it is an issue, the memory-allocator locking handles it). However, @@ -366,7 +343,7 @@ over a rather long period of time, but improvements are always welcome! not the case, a self-spawning RCU callback would prevent the victim CPU from ever going offline.) -14. Unlike other forms of RCU, it -is- permissible to block in an +13. Unlike other forms of RCU, it -is- permissible to block in an SRCU read-side critical section (demarked by srcu_read_lock() and srcu_read_unlock()), hence the "SRCU": "sleepable RCU". Please note that if you don't need to sleep in read-side critical @@ -410,7 +387,7 @@ over a rather long period of time, but improvements are always welcome! Note that rcu_dereference() and rcu_assign_pointer() relate to SRCU just as they do to other forms of RCU. -15. 
The whole point of call_rcu(), synchronize_rcu(), and friends +14. The whole point of call_rcu(), synchronize_rcu(), and friends is to wait until all pre-existing readers have finished before carrying out some otherwise-destructive operation. It is therefore critically important to -first- remove any path @@ -422,13 +399,13 @@ over a rather long period of time, but improvements are always welcome! is the caller's responsibility to guarantee that any subsequent readers will execute safely. -16. The various RCU read-side primitives do -not- necessarily contain +15. The various RCU read-side primitives do -not- necessarily contain memory barriers. You should therefore plan for the CPU and the compiler to freely reorder code into and out of RCU read-side critical sections. It is the responsibility of the RCU update-side primitives to deal with this. -17. Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the +16. Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the __rcu sparse checks to validate your RCU code. These can help find problems as follows: @@ -451,7 +428,7 @@ over a rather long period of time, but improvements are always welcome! These debugging aids can help you find problems that are otherwise extremely difficult to spot. -18. If you register a callback using call_rcu(), call_rcu_bh(), +17. If you register a callback using call_rcu(), call_rcu_bh(), call_rcu_sched(), or call_srcu(), and pass in a function defined within a loadable module, then it is necessary to wait for all pending callbacks to be invoked after the last invocation -- cgit v1.2.3 From 1c7d6d4411a1ce7530cbdc4605261c560e07d51a Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 5 Oct 2018 16:18:14 -0700 Subject: doc: rcu: Encourage use of rcu_barrier in checklist The checklist suggests rcu_barrier_bh() for RCU-bh and similarly for sched, however these APIs are now implemented as rcu_barrier() itself due to the RCU consolidation. 
This commit therefore corrects checklist.txt to encourage use of the underlying rcu_barrier() API. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- Documentation/RCU/checklist.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index b90ad1b0665a..6f469864d9f5 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -442,8 +442,8 @@ over a rather long period of time, but improvements are always welcome! You instead need to use one of the barrier functions: o call_rcu() -> rcu_barrier() - o call_rcu_bh() -> rcu_barrier_bh() - o call_rcu_sched() -> rcu_barrier_sched() + o call_rcu_bh() -> rcu_barrier() + o call_rcu_sched() -> rcu_barrier() o call_srcu() -> srcu_barrier() However, these barrier functions are absolutely -not- guaranteed -- cgit v1.2.3 From 93eb14201fc690687c2d94865bc38c1aa23356b8 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 8 Oct 2018 18:33:41 -0700 Subject: doc: Make reader aware of rcu_dereference_protected The whatisRCU.txt document says rcu_dereference() cannot be used outside of rcu_read_lock() protected sections. The commit adds a mention of rcu_dereference_protected(), so that the new reader knows that this API can be used to avoid update-side use of rcu_read_lock() and rcu_read_unlock(). Cc: tytso@mit.edu Suggested-by: tytso@mit.edu Signed-off-by: Joel Fernandes (Google) [ paulmck: Update wording, including further feedback from Joel. ] Signed-off-by: Paul E. 
McKenney --- Documentation/RCU/whatisRCU.txt | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 7c33445fd0e5..4a6854318b17 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -266,7 +266,7 @@ rcu_dereference() unnecessary overhead on Alpha CPUs. Note that the value returned by rcu_dereference() is valid - only within the enclosing RCU read-side critical section. + only within the enclosing RCU read-side critical section [1]. For example, the following is -not- legal: rcu_read_lock(); @@ -292,6 +292,19 @@ rcu_dereference() typically used indirectly, via the _rcu list-manipulation primitives, such as list_for_each_entry_rcu(). + [1] The variant rcu_dereference_protected() can be used outside + of an RCU read-side critical section as long as the usage is + protected by locks acquired by the update-side code. This variant + avoids the lockdep warning that would happen when using (for + example) rcu_dereference() without rcu_read_lock() protection. + Using rcu_dereference_protected() also has the advantage + of permitting compiler optimizations that rcu_dereference() + must prohibit. The rcu_dereference_protected() variant takes + a lockdep expression to indicate which locks must be acquired + by the caller. If the indicated protection is not provided, + a lockdep splat is emitted. See RCU/Design/Requirements.html + and the API's code comments for more details and example usage. + The following diagram shows how each API communicates among the reader, updater, and reclaimer. -- cgit v1.2.3 From 8b9df28d7f2e50dc1be758e98dad61ec77d6f6b5 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 14 Oct 2018 14:29:55 -0700 Subject: doc: Remove obsolete (non-)requirement about disabling preemption The Requirements.html document says "Disabling Preemption Does Not Block Grace Periods". 
However this is no longer true with the RCU consolidation. This commit therefore removes the obsolete (non-)requirement entirely. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- .../RCU/Design/Requirements/Requirements.html | 50 ---------------------- 1 file changed, 50 deletions(-) (limited to 'Documentation') diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 7efc1c1da7af..4fae55056c1d 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -900,8 +900,6 @@ Except where otherwise noted, these non-guarantees were premeditated. Grace Periods Don't Partition Read-Side Critical Sections

  • Read-Side Critical Sections Don't Partition Grace Periods -
  • - Disabling Preemption Does Not Block Grace Periods

    Readers Impose Minimal Ordering

    @@ -1259,54 +1257,6 @@ of RCU grace periods.
  •  
     
    -

    -Disabling Preemption Does Not Block Grace Periods

    - -

    -There was a time when disabling preemption on any given CPU would block -subsequent grace periods. -However, this was an accident of implementation and is not a requirement. -And in the current Linux-kernel implementation, disabling preemption -on a given CPU in fact does not block grace periods, as Oleg Nesterov -demonstrated. - -

    -If you need a preempt-disable region to block grace periods, you need to add -rcu_read_lock() and rcu_read_unlock(), for example -as follows: - -

    -
    - 1 preempt_disable();
    - 2 rcu_read_lock();
    - 3 do_something();
    - 4 rcu_read_unlock();
    - 5 preempt_enable();
    - 6
    - 7 /* Spinlocks implicitly disable preemption. */
    - 8 spin_lock(&mylock);
    - 9 rcu_read_lock();
    -10 do_something();
    -11 rcu_read_unlock();
    -12 spin_unlock(&mylock);
    -
    -
    - -

    -In theory, you could enter the RCU read-side critical section first, -but it is more efficient to keep the entire RCU read-side critical -section contained in the preempt-disable region as shown above. -Of course, RCU read-side critical sections that extend outside of -preempt-disable regions will work correctly, but such critical sections -can be preempted, which forces rcu_read_unlock() to do -more work. -And no, this is not an invitation to enclose all of your RCU -read-side critical sections within preempt-disable regions, because -doing so would degrade real-time response. - -

    -This non-requirement appeared with preemptible RCU. -

    Parallelism Facts of Life

    -- cgit v1.2.3 From 97949f0176da396c32e7c881cbfbc61642fb1266 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 14 Oct 2018 19:29:42 -0700 Subject: doc: Make listing in RCU perf/scale requirements use rcu_access_pointer() The code listing under this section has a quick quiz that says line 19 uses rcu_access_pointer, but the code listing itself instead uses rcu_dereference(). This commit therefore makes the code listing match the quick quiz. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- Documentation/RCU/Design/Requirements/Requirements.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 4fae55056c1d..f74a2233865c 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1596,7 +1596,7 @@ used in place of synchronize_rcu() as follows: 16 struct foo *p; 17 18 spin_lock(&gp_lock); -19 p = rcu_dereference(gp); +19 p = rcu_access_pointer(gp); 20 if (!p) { 21 spin_unlock(&gp_lock); 22 return false; -- cgit v1.2.3 From 97562c018135a9d01c59bd3bf95a9458548b79e2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 Oct 2018 10:54:13 -0700 Subject: doc: RCU scheduler spinlock rcu_read_unlock() restriction remains Given RCU flavor consolidation, when rcu_read_unlock() is invoked with interrupts disabled, the reporting of the corresponding quiescent state is deferred until interrupts are re-enabled. There was therefore some hope that this would allow dropping the restriction against holding scheduler spinlocks across an rcu_read_unlock() without disabling interrupts across the entire corresponding RCU read-side critical section. 
Unfortunately, the need to quickly provide a quiescent state to expedited grace periods sometimes requires a call to raise_softirq() during rcu_read_unlock() execution. Because raise_softirq() can sometimes acquire the scheduler spinlocks, the restriction must remain in effect. This commit therefore updates the RCU requirements documentation accordingly. Signed-off-by: Paul E. McKenney --- .../RCU/Design/Requirements/Requirements.html | 44 ++++++++++++++-------- 1 file changed, 29 insertions(+), 15 deletions(-) (limited to 'Documentation') diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index f74a2233865c..9fca73e03a98 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -2475,23 +2475,37 @@ for context-switch-heavy CONFIG_NO_HZ_FULL=y workloads, but there is room for further improvement.

    -In the past, it was forbidden to disable interrupts across an -rcu_read_unlock() unless that interrupt-disabled region -of code also included the matching rcu_read_lock(). -Violating this restriction could result in deadlocks involving the -scheduler's runqueue and priority-inheritance spinlocks. -This restriction was lifted when interrupt-disabled calls to -rcu_read_unlock() started deferring the reporting of -the resulting RCU-preempt quiescent state until the end of that +It is forbidden to hold any of the scheduler's runqueue or priority-inheritance +spinlocks across an rcu_read_unlock() unless interrupts have been +disabled across the entire RCU read-side critical section, that is, +up to and including the matching rcu_read_lock(). +Violating this restriction can result in deadlocks involving these +scheduler spinlocks. +There was hope that this restriction might be lifted when interrupt-disabled +calls to rcu_read_unlock() started deferring the reporting of +the resulting RCU-preempt quiescent state until the end of the corresponding interrupts-disabled region. -This deferred reporting means that the scheduler's runqueue and -priority-inheritance locks cannot be held while reporting an RCU-preempt -quiescent state, which lifts the earlier restriction, at least from -a deadlock perspective. -Unfortunately, real-time systems using RCU priority boosting may +Unfortunately, timely reporting of the corresponding quiescent state +to expedited grace periods requires a call to raise_softirq(), +which can acquire these scheduler spinlocks. +In addition, real-time systems using RCU priority boosting need this restriction to remain in effect because deferred -quiescent-state reporting also defers deboosting, which in turn -degrades real-time latencies. +quiescent-state reporting would also defer deboosting, which in turn +would degrade real-time latencies. + +

    +In theory, if a given RCU read-side critical section could be +guaranteed to be less than one second in duration, holding a scheduler +spinlock across that critical section's rcu_read_unlock() +would require only that preemption be disabled across the entire +RCU read-side critical section, not interrupts. +Unfortunately, given the possibility of vCPU preemption, long-running +interrupts, and so on, it is not possible in practice to guarantee +that a given RCU read-side critical section will complete in less than +one second. +Therefore, as noted above, if scheduler spinlocks are held across +a given call to rcu_read_unlock(), interrupts must be +disabled across the entire RCU read-side critical section.

    Tracing and RCU

    -- cgit v1.2.3 From a78ad16c7f0f948284d6927be95bc0e31a7b170b Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 29 Oct 2018 22:15:59 -0700 Subject: doc: Correct parameter in stallwarn The stallwarn document incorrectly mentions 'fps=' instead of 'fqs='. This commit corrects that. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- Documentation/RCU/stallwarn.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index b01bcafc64aa..073dbc12d1ea 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -205,7 +205,7 @@ handlers are no longer able to execute on this CPU. This can happen if the stalled CPU is spinning with interrupts disabled, or, in -rt kernels, if a high-priority process is starving RCU's softirq handler. -The "fps=" shows the number of force-quiescent-state idle/offline +The "fqs=" shows the number of force-quiescent-state idle/offline detection passes that the grace-period kthread has made across this CPU since the last time that this CPU noted the beginning of a grace period. -- cgit v1.2.3 From 97b59370fa5959d5833a54f303f640d094af3d3c Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 27 Oct 2018 21:30:46 -0700 Subject: doc: Fix "struction" typo in RCU memory-ordering documentation This commit replaces "struction" with the correct "structure". Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html index a346ce0116eb..e4d94fba6c89 100644 --- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html +++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html @@ -77,7 +77,7 @@ The key point is that the lock-acquisition functions, including smp_mb__after_unlock_lock() immediately after successful acquisition of the lock. -

    Therefore, for any given rcu_node struction, any access +

    Therefore, for any given rcu_node structure, any access happening before one of the above lock-release functions will be seen by all CPUs as happening before any access happening after a later one of the above lock-acquisition functions. -- cgit v1.2.3 From fc6f9c57787e578473d47b7bbc846e317d17c1df Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Aug 2018 14:43:05 -0700 Subject: rcutorture: Remove cbflood facility Now that the forward-progress code does a full-bore continuous callback flood lasting multiple seconds, there is little point in also posting a mere 60,000 callbacks every second or so. This commit therefore removes the old cbflood testing. Over time, it may be desirable to concurrently do full-bore continuous callback floods on all CPUs simultaneously, but one dragon at a time. Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 18 ------ kernel/rcu/rcutorture.c | 86 +------------------------ 2 files changed, 1 insertion(+), 103 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3823679deea5..86e825e0927a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3743,24 +3743,6 @@ in microseconds. The default of zero says no holdoff. - rcutorture.cbflood_inter_holdoff= [KNL] - Set holdoff time (jiffies) between successive - callback-flood tests. - - rcutorture.cbflood_intra_holdoff= [KNL] - Set holdoff time (jiffies) between successive - bursts of callbacks within a given callback-flood - test. - - rcutorture.cbflood_n_burst= [KNL] - Set the number of bursts making up a given - callback-flood test. Set this to zero to - disable callback-flood testing. - - rcutorture.cbflood_n_per_burst= [KNL] - Set the number of callbacks to be registered - in a given burst of a callback-flood test. 
- rcutorture.fqs_duration= [KNL] Set duration of force_quiescent_state bursts in microseconds. diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8cf700ca7845..17f480129a78 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -80,13 +80,6 @@ MODULE_AUTHOR("Paul E. McKenney and Josh Triplett 0 && - cbflood_inter_holdoff > 0 && - cbflood_intra_holdoff > 0 && - cur_ops->call && - cur_ops->cb_barrier) { - rhp = vmalloc(array3_size(cbflood_n_burst, - cbflood_n_per_burst, - sizeof(*rhp))); - err = !rhp; - } - if (err) { - VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM"); - goto wait_for_stop; - } - VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started"); - do { - schedule_timeout_interruptible(cbflood_inter_holdoff); - atomic_long_inc(&n_cbfloods); - WARN_ON(signal_pending(current)); - for (i = 0; i < cbflood_n_burst; i++) { - for (j = 0; j < cbflood_n_per_burst; j++) { - cur_ops->call(&rhp[i * cbflood_n_per_burst + j], - rcu_torture_cbflood_cb); - } - schedule_timeout_interruptible(cbflood_intra_holdoff); - WARN_ON(signal_pending(current)); - } - cur_ops->cb_barrier(); - stutter_wait("rcu_torture_cbflood"); - } while (!torture_must_stop()); - vfree(rhp); -wait_for_stop: - torture_kthread_stopping("rcu_torture_cbflood"); - return 0; -} - /* * RCU torture force-quiescent-state kthread. 
Repeatedly induces * bursts of calls to force_quiescent_state(), increasing the probability @@ -1460,11 +1397,10 @@ rcu_torture_stats_print(void) n_rcu_torture_boosts, atomic_long_read(&n_rcu_torture_timers)); torture_onoff_stats(); - pr_cont("barrier: %ld/%ld:%ld ", + pr_cont("barrier: %ld/%ld:%ld\n", n_barrier_successes, n_barrier_attempts, n_rcu_torture_barrier_error); - pr_cont("cbflood: %ld\n", atomic_long_read(&n_cbfloods)); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) != 0 || @@ -2093,8 +2029,6 @@ rcu_torture_cleanup(void) cur_ops->name, gp_seq, flags); torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); - for (i = 0; i < ncbflooders; i++) - torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); if (rcu_torture_can_boost()) cpuhp_remove_state(rcutor_hp); @@ -2377,24 +2311,6 @@ rcu_torture_init(void) goto unwind; if (object_debug) rcu_test_debug_objects(); - if (cbflood_n_burst > 0) { - /* Create the cbflood threads */ - ncbflooders = (num_online_cpus() + 3) / 4; - cbflood_task = kcalloc(ncbflooders, sizeof(*cbflood_task), - GFP_KERNEL); - if (!cbflood_task) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < ncbflooders; i++) { - firsterr = torture_create_kthread(rcu_torture_cbflood, - NULL, - cbflood_task[i]); - if (firsterr) - goto unwind; - } - } torture_init_end(); return 0; -- cgit v1.2.3