From 8e9c01c717df7e05c5bd1ca86aaa3a74b31f37f1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 9 Apr 2021 00:38:59 +0200 Subject: srcu: Initialize SRCU after timers Once srcu_init() is called, the SRCU core will make use of delayed workqueues, which rely on timers. However init_timers() is called several steps after rcu_init(). This means that a call_srcu() after rcu_init() but before init_timers() would find itself within a dangerously uninitialized timer core. This commit therefore creates a separate call to srcu_init() after init_timer() completes, which ensures that we stay in early SRCU mode until timers are safe(r). Signed-off-by: Frederic Weisbecker Cc: Uladzislau Rezki Cc: Boqun Feng Cc: Lai Jiangshan Cc: Neeraj Upadhyay Cc: Josh Triplett Cc: Joel Fernandes Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel/rcu/rcu.h') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index bf0827d4b659..ca3f2af32bf8 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -422,12 +422,6 @@ do { \ #endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */ -#ifdef CONFIG_SRCU -void srcu_init(void); -#else /* #ifdef CONFIG_SRCU */ -static inline void srcu_init(void) { } -#endif /* #else #ifdef CONFIG_SRCU */ - #ifdef CONFIG_TINY_RCU /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ static inline bool rcu_gp_is_normal(void) { return true; } -- cgit v1.2.3 From b5befe842e6612cf894cf4a199924ee872d8b7d8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 17 Apr 2021 15:16:49 +0200 Subject: srcu: Fix broken node geometry after early ssp init An srcu_struct structure that is initialized before rcu_init_geometry() will have its srcu_node hierarchy based on CONFIG_NR_CPUS. Once rcu_init_geometry() is called, this hierarchy is compressed as needed for the actual maximum number of CPUs for this system. Later on, that srcu_struct structure is confused, sometimes referring to its initial CONFIG_NR_CPUS-based hierarchy, and sometimes instead to the new num_possible_cpus() hierarchy. For example, each of its ->mynode fields continues to reference the original leaf rcu_node structures, some of which might no longer exist. On the other hand, srcu_for_each_node_breadth_first() traverses to the new node hierarchy. There are at least two bad possible outcomes to this: 1) a) A callback enqueued early on an srcu_data structure (call it *sdp) is recorded pending on sdp->mynode->srcu_data_have_cbs in srcu_funnel_gp_start() with sdp->mynode pointing to a deep leaf (say 3 levels). b) The grace period ends after rcu_init_geometry() shrinks the nodes level to a single one. srcu_gp_end() walks through the new srcu_node hierarchy without ever reaching the old leaves so the callback is never executed. This is easily reproduced on an 8 CPUs machine with CONFIG_NR_CPUS >= 32 and "rcupdate.rcu_self_test=1". The srcu_barrier() after early tests verification never completes and the boot hangs: [ 5413.141029] INFO: task swapper/0:1 blocked for more than 4915 seconds. [ 5413.147564] Not tainted 5.12.0-rc4+ #28 [ 5413.151927] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 5413.159753] task:swapper/0 state:D stack: 0 pid: 1 ppid: 0 flags:0x00004000 [ 5413.168099] Call Trace: [ 5413.170555] __schedule+0x36c/0x930 [ 5413.174057] ? wait_for_completion+0x88/0x110 [ 5413.178423] schedule+0x46/0xf0 [ 5413.181575] schedule_timeout+0x284/0x380 [ 5413.185591] ? wait_for_completion+0x88/0x110 [ 5413.189957] ? mark_held_locks+0x61/0x80 [ 5413.193882] ? mark_held_locks+0x61/0x80 [ 5413.197809] ? _raw_spin_unlock_irq+0x24/0x50 [ 5413.202173] ? wait_for_completion+0x88/0x110 [ 5413.206535] wait_for_completion+0xb4/0x110 [ 5413.210724] ? srcu_torture_stats_print+0x110/0x110 [ 5413.215610] srcu_barrier+0x187/0x200 [ 5413.219277] ? rcu_tasks_verify_self_tests+0x50/0x50 [ 5413.224244] ? rdinit_setup+0x2b/0x2b [ 5413.227907] rcu_verify_early_boot_tests+0x2d/0x40 [ 5413.232700] do_one_initcall+0x63/0x310 [ 5413.236541] ? rdinit_setup+0x2b/0x2b [ 5413.240207] ? rcu_read_lock_sched_held+0x52/0x80 [ 5413.244912] kernel_init_freeable+0x253/0x28f [ 5413.249273] ? rest_init+0x250/0x250 [ 5413.252846] kernel_init+0xa/0x110 [ 5413.256257] ret_from_fork+0x22/0x30 2) An srcu_struct structure that is initialized before rcu_init_geometry() and used afterward will always have stale rdp->mynode references, resulting in callbacks to be missed in srcu_gp_end(), just like in the previous scenario. This commit therefore causes init_srcu_struct_nodes to initialize the geometry, if needed. This ensures that the srcu_node hierarchy is properly built and distributed from the get-go. Suggested-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Cc: Boqun Feng Cc: Lai Jiangshan Cc: Neeraj Upadhyay Cc: Josh Triplett Cc: Joel Fernandes Cc: Uladzislau Rezki Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 2 ++ kernel/rcu/srcutree.c | 3 +++ kernel/rcu/tree.c | 16 +++++++++++++++- 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel/rcu/rcu.h') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index ca3f2af32bf8..28de768bd875 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -308,6 +308,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) } } +extern void rcu_init_geometry(void); + /* Returns a pointer to the first leaf rcu_node structure. */ #define rcu_first_leaf_node() (rcu_state.level[rcu_num_lvls - 1]) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 9ec35c158740..5fdbde687feb 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -90,6 +90,9 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp) struct srcu_node *snp; struct srcu_node *snp_first; + /* Initialize geometry if it has not already been initialized. */ + rcu_init_geometry(); + /* Work out the overall tree geometry. */ ssp->level[0] = &ssp->node[0]; for (i = 1; i < rcu_num_lvls; i++) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c35b15229cff..0420b23fc9d0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4582,11 +4582,25 @@ static void __init rcu_init_one(void) * replace the definitions in tree.h because those are needed to size * the ->node array in the rcu_state structure. */ -static void __init rcu_init_geometry(void) +void rcu_init_geometry(void) { ulong d; int i; + static unsigned long old_nr_cpu_ids; int rcu_capacity[RCU_NUM_LVLS]; + static bool initialized; + + if (initialized) { + /* + * Warn if setup_nr_cpu_ids() had not yet been invoked, + * unless nr_cpus_ids == NR_CPUS, in which case who cares? + */ + WARN_ON_ONCE(old_nr_cpu_ids != nr_cpu_ids); + return; + } + + old_nr_cpu_ids = nr_cpu_ids; + initialized = true; /* * Initialize any unspecified boot parameters. -- cgit v1.2.3 From 0260b92e1c39412b1e345e202355c43169c16274 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 8 Apr 2021 13:01:14 -0700 Subject: rcutorture: Forgive RCU boost failures when CPUs don't pass through QS Currently, rcu_torture_boost() runs CPU-bound at real-time priority to force RCU priority inversions. It then checks that grace periods progress during this CPU-bound time. If grace periods fail to progress, it reports and RCU priority boosting failure. However, it is possible (and sometimes does happen) that the grace period fails to progress due to a CPU failing to pass through a quiescent state for an extended time period (3.5 seconds by default). This can happen due to vCPU preemption, long-running interrupts, and much else besides. There is nothing that RCU priority boosting can do about these situations, and so they should not be counted as RCU priority boosting failures. This commit therefore checks for CPUs (as opposed to preempted tasks) holding up a grace period, and flags the resulting RCU priority boosting failures, but does not splat nor count them as errors. It does rate-limit them to avoid flooding the console log. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 2 ++ kernel/rcu/rcutorture.c | 67 ++++++++++++++++++++++++++++++------------------- kernel/rcu/tree_stall.h | 36 ++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 26 deletions(-) (limited to 'kernel/rcu/rcu.h') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index bf0827d4b659..daf0cd3f2926 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -519,6 +519,7 @@ static inline unsigned long rcu_exp_batches_completed(void) { return 0; } static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) { return 0; } static inline void rcu_force_quiescent_state(void) { } +static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; } static inline void show_rcu_gp_kthreads(void) { } static inline int rcu_get_gp_kthreads_prio(void) { return 0; } static inline void rcu_fwd_progress_check(unsigned long j) { } @@ -527,6 +528,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); +bool rcu_check_boost_fail(unsigned long gp_state, int *cpup); void show_rcu_gp_kthreads(void); int rcu_get_gp_kthreads_prio(void); void rcu_fwd_progress_check(unsigned long j); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 02a14dfcae67..5ae4dcc6ba27 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -340,6 +340,7 @@ struct rcu_torture_ops { void (*fqs)(void); void (*stats)(void); void (*gp_kthread_dbg)(void); + bool (*check_boost_failed)(unsigned long gp_state, int *cpup); int (*stall_dur)(void); int irq_capable; int can_boost; @@ -483,31 +484,32 @@ static void rcu_sync_torture_init(void) } static struct rcu_torture_ops rcu_ops = { - .ttype = RCU_FLAVOR, - .init = rcu_sync_torture_init, - .readlock = rcu_torture_read_lock, - .read_delay = rcu_read_delay, - .readunlock = rcu_torture_read_unlock, - .readlock_held = torture_readlock_not_held, - .get_gp_seq = rcu_get_gp_seq, - .gp_diff = rcu_seq_diff, - .deferred_free = rcu_torture_deferred_free, - .sync = synchronize_rcu, - .exp_sync = synchronize_rcu_expedited, - .get_gp_state = get_state_synchronize_rcu, - .start_gp_poll = start_poll_synchronize_rcu, - .poll_gp_state = poll_state_synchronize_rcu, - .cond_sync = cond_synchronize_rcu, - .call = call_rcu, - .cb_barrier = rcu_barrier, - .fqs = rcu_force_quiescent_state, - .stats = NULL, - .gp_kthread_dbg = show_rcu_gp_kthreads, - .stall_dur = rcu_jiffies_till_stall_check, - .irq_capable = 1, - .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), - .extendables = RCUTORTURE_MAX_EXTEND, - .name = "rcu" + .ttype = RCU_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, + .readunlock = rcu_torture_read_unlock, + .readlock_held = torture_readlock_not_held, + .get_gp_seq = rcu_get_gp_seq, + .gp_diff = rcu_seq_diff, + .deferred_free = rcu_torture_deferred_free, + .sync = synchronize_rcu, + .exp_sync = synchronize_rcu_expedited, + .get_gp_state = get_state_synchronize_rcu, + .start_gp_poll = start_poll_synchronize_rcu, + .poll_gp_state = poll_state_synchronize_rcu, + .cond_sync = cond_synchronize_rcu, + .call = call_rcu, + .cb_barrier = rcu_barrier, + .fqs = rcu_force_quiescent_state, + .stats = NULL, + .gp_kthread_dbg = show_rcu_gp_kthreads, + .check_boost_failed = rcu_check_boost_fail, + .stall_dur = rcu_jiffies_till_stall_check, + .irq_capable = 1, + .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), + .extendables = RCUTORTURE_MAX_EXTEND, + .name = "rcu" }; /* @@ -918,14 +920,27 @@ static void rcu_torture_enable_rt_throttle(void) static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long start, unsigned long end) { + int cpu; static int dbg_done; bool gp_done; + unsigned long j; + static unsigned long last_persist; + unsigned long lp; + unsigned long mininterval = test_boost_duration * HZ - HZ / 2; - if (end - start > test_boost_duration * HZ - HZ / 2) { + if (end - start > mininterval) { // Recheck after checking time to avoid false positives. smp_mb(); // Time check before grace-period check. if (cur_ops->poll_gp_state(gp_state)) return false; // passed, though perhaps just barely + if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, &cpu)) { + // At most one persisted message per boost test. + j = jiffies; + lp = READ_ONCE(last_persist); + if (time_after(j, lp + mininterval) && cmpxchg(&last_persist, lp, j) == lp) + pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu); + return false; // passed on a technicality + } VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); n_rcu_torture_boost_failure++; if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg) { diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 59b95cc5cbdf..af92d9fee0d4 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -716,6 +716,42 @@ static void check_cpu_stall(struct rcu_data *rdp) // RCU forward-progress mechanisms, including of callback invocation. +/* + * Check to see if a failure to end RCU priority inversion was due to + * a CPU not passing through a quiescent state. When this happens, there + * is nothing that RCU priority boosting can do to help, so we shouldn't + * count this as an RCU priority boosting failure. A return of true says + * RCU priority boosting is to blame, and false says otherwise. If false + * is returned, the first of the CPUs to blame is stored through cpup. + */ +bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) +{ + int cpu; + unsigned long flags; + struct rcu_node *rnp; + + rcu_for_each_leaf_node(rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (!rnp->qsmask) { + // No CPUs without quiescent states for this rnp. + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + continue; + } + // Find the first holdout CPU. + for_each_leaf_node_possible_cpu(rnp, cpu) { + if (rnp->qsmask & (1UL << (cpu - rnp->grplo))) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + *cpup = cpu; + return false; + } + } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + // Can't blame CPUs, so must blame RCU priority boosting. + return true; +} +EXPORT_SYMBOL_GPL(rcu_check_boost_fail); + /* * Show the state of the grace-period kthreads. */ -- cgit v1.2.3 From 474d0997361c07d163693d0de41e76a2f2899d0a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 20 Apr 2021 10:58:07 -0700 Subject: tasks-rcu: Make show_rcu_tasks_gp_kthreads() be static inline In some architectures, the no-op variant of show_rcu_tasks_gp_kthreads() get "no previous prototype" compiler warnings. These are false positives given that kernel/rcu/tasks.h is included only once. But why put up with the compiler noise? This commit therefore adds "static inline" to this definition to force the compiler to accept this situation, while also moving it to its proper place in kernel/rcu/rcu.h. Reported-by: kernel test robot [ paulmck: Update per Stephen Rothwell feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 4 ++++ kernel/rcu/tasks.h | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/rcu/rcu.h') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index bf0827d4b659..c0b3ab6736b8 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -441,7 +441,11 @@ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ void rcu_expedite_gp(void); void rcu_unexpedite_gp(void); void rcupdate_announce_bootup_oddness(void); +#ifdef CONFIG_TASKS_RCU_GENERIC void show_rcu_tasks_gp_kthreads(void); +#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +static inline void show_rcu_tasks_gp_kthreads(void) {} +#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ void rcu_request_urgent_qs_task(struct task_struct *t); #endif /* #else #ifdef CONFIG_TINY_RCU */ diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d6aa352cd705..fc218539d151 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1401,5 +1401,4 @@ void __init rcu_init_tasks_generic(void) #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {} -void show_rcu_tasks_gp_kthreads(void) {} #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ -- cgit v1.2.3